Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(synced 2025-01-15 02:05:33 +00:00)

Merge branch 'for-next' of git://git.kernel.dk/linux-block.git

commit e8f6787ce4
@@ -15,6 +15,7 @@ PCI Endpoint Framework

   pci-ntb-howto
   pci-vntb-function
   pci-vntb-howto
   pci-nvme-function

   function/binding/pci-test
   function/binding/pci-ntb
Documentation/PCI/endpoint/pci-nvme-function.rst (new file, 13 lines)

@@ -0,0 +1,13 @@
.. SPDX-License-Identifier: GPL-2.0

=================
PCI NVMe Function
=================

:Author: Damien Le Moal <dlemoal@kernel.org>

The PCI NVMe endpoint function implements a PCI NVMe controller using the NVMe
subsystem target core code. The driver for this function resides with the NVMe
subsystem as drivers/nvme/target/nvmet-pciep.c.

See Documentation/nvme/nvme-pci-endpoint-target.rst for more details.
Documentation/nvme/index.rst (new file, 12 lines)

@@ -0,0 +1,12 @@
.. SPDX-License-Identifier: GPL-2.0

==============
NVMe Subsystem
==============

.. toctree::
   :maxdepth: 2
   :numbered:

   feature-and-quirk-policy
   nvme-pci-endpoint-target
Documentation/nvme/nvme-pci-endpoint-target.rst (new file, 368 lines)

@@ -0,0 +1,368 @@
.. SPDX-License-Identifier: GPL-2.0

=================================
NVMe PCI Endpoint Function Target
=================================

:Author: Damien Le Moal <dlemoal@kernel.org>

The NVMe PCI endpoint function target driver implements an NVMe PCIe controller
using an NVMe fabrics target controller configured with the PCI transport type.

Overview
========
The NVMe PCI endpoint function target driver allows exposing an NVMe target
controller over a PCIe link, thus implementing an NVMe PCIe device similar to a
regular M.2 SSD. The target controller is created in the same manner as when
using NVMe over fabrics: the controller represents the interface to an NVMe
subsystem using a port. The port transfer type must be configured to be
"pci". The subsystem can be configured to have namespaces backed by regular
files or block devices, or can use NVMe passthrough to expose to the PCI host an
existing physical NVMe device or an NVMe fabrics host controller (e.g. an NVMe
TCP host controller).

The NVMe PCI endpoint function target driver relies as much as possible on the
NVMe target core code to parse and execute NVMe commands submitted by the PCIe
host. However, using the PCI endpoint framework API and DMA API, the driver is
also responsible for managing all data transfers over the PCIe link. This
implies that the NVMe PCI endpoint function target driver manages several NVMe
data structures itself and does some NVMe command parsing of its own.

1) The driver manages retrieval of NVMe commands in submission queues using DMA
   if supported, or MMIO otherwise. Each command retrieved is then executed
   using a work item to maximize performance with the parallel execution of
   multiple commands on different CPUs. The driver uses a work item to
   constantly poll the doorbells of all submission queues to detect command
   submissions from the PCIe host.

2) The driver transfers completion queue entries of completed commands to the
   PCIe host using MMIO copy of the entries into the host completion queue.
   After posting completion entries in a completion queue, the driver uses the
   PCI endpoint framework API to raise an interrupt to the host to signal
   command completion.

3) For any command that has a data buffer, the NVMe PCI endpoint target driver
   parses the command PRP or SGL lists to create a list of PCI address segments
   representing the mapping of the command data buffer on the host.
   The command data buffer is transferred over the PCIe link using this list of
   PCI address segments using DMA, if supported. If DMA is not supported, MMIO
   is used, which results in poor performance. For write commands, the command
   data buffer is transferred from the host into a local memory buffer before
   executing the command using the target core code. For read commands, a local
   memory buffer is allocated to execute the command and the content of that
   buffer is transferred to the host once the command completes.
Controller Capabilities
-----------------------

The NVMe capabilities exposed to the PCIe host through the BAR 0 registers
are almost identical to the capabilities of the NVMe target controller
implemented by the target core code. There are some exceptions.

1) The NVMe PCI endpoint target driver always sets the controller capability
   CQR bit to request "Contiguous Queues Required". This is to facilitate the
   mapping of a queue PCI address range to the local CPU address space.

2) The doorbell stride (DSTRD) is always set to be 4B.

3) Since the PCI endpoint framework does not provide a way to handle PCI level
   resets, the controller capability NSSR bit (NVM Subsystem Reset Supported)
   is always cleared.

4) The boot partition support (BPS), Persistent Memory Region Supported (PMRS)
   and Controller Memory Buffer Supported (CMBS) capabilities are never
   reported.
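From the PCI host side, the capabilities that the endpoint ends up exposing can
be checked once the device is visible, for instance with the *nvme* command
line utility (a minimal illustration only; availability of the command and its
output format depend on the installed nvme-cli version)::

   # nvme show-regs -H /dev/nvme0
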
Supported Features
------------------

The NVMe PCI endpoint target driver implements support for both PRPs and SGLs.
The driver also implements IRQ vector coalescing and submission queue
arbitration burst.

The maximum number of queues and the maximum data transfer size (MDTS) are
configurable through configfs before starting the controller. To avoid issues
with excessive local memory usage for executing commands, MDTS defaults to 512
KB and is limited to a maximum of 2 MB (arbitrary limit).
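For example, assuming the endpoint function is instantiated as *nvmepf.0* (the
name used in the setup example later in this document), MDTS could be raised to
1 MB before starting the controller using the *mdts_kb* attribute described in
the Endpoint Bindings section (the value shown is only illustrative)::

   # echo 1024 > /sys/kernel/config/pci_ep/functions/nvmet_pci_epf/nvmepf.0/nvme/mdts_kb
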
Minimum number of PCI Address Mapping Windows Required
------------------------------------------------------

Most PCI endpoint controllers provide a limited number of mapping windows for
mapping a PCI address range to local CPU memory addresses. The NVMe PCI
endpoint target controller uses mapping windows for the following.

1) One memory window for raising MSI or MSI-X interrupts
2) One memory window for MMIO transfers
3) One memory window for each completion queue

Given the highly asynchronous nature of the NVMe PCI endpoint target driver
operation, the memory windows as described above will generally not be used
simultaneously, but that may happen. So a safe maximum number of completion
queues that can be supported is equal to the total number of memory mapping
windows of the PCI endpoint controller minus two. E.g. for an endpoint PCI
controller with 32 outbound memory windows available, up to 30 completion
queues can be safely operated without any risk of getting PCI address mapping
errors due to the lack of memory windows.
Maximum Number of Queue Pairs
-----------------------------

Upon binding of the NVMe PCI endpoint target driver to the PCI endpoint
controller, BAR 0 is allocated with enough space to accommodate the admin queue
and multiple I/O queues. The maximum number of I/O queue pairs that can be
supported is limited by several factors.

1) The NVMe target core code limits the maximum number of I/O queues to the
   number of online CPUs.
2) The total number of queue pairs, including the admin queue, cannot exceed
   the number of MSI-X or MSI vectors available.
3) The total number of completion queues must not exceed the total number of
   PCI mapping windows minus 2 (see above).

The NVMe endpoint function driver allows configuring the maximum number of
queue pairs through configfs.
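For instance, the setup example later in this document limits the controller to
4 I/O queue pairs through the target subsystem *attr_qid_max* attribute (the
subsystem name is only illustrative)::

   # echo 4 > /sys/kernel/config/nvmet/subsystems/nvmepf.0.nqn/attr_qid_max
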
Limitations and NVMe Specification Non-Compliance
-------------------------------------------------

Similar to the NVMe target core code, the NVMe PCI endpoint target driver does
not support multiple submission queues using the same completion queue. All
submission queues must specify a unique completion queue.


User Guide
==========
This section describes the hardware requirements and how to set up an NVMe PCI
endpoint target device.

Kernel Requirements
-------------------

The kernel must be compiled with the configuration options CONFIG_PCI_ENDPOINT,
CONFIG_PCI_ENDPOINT_CONFIGFS, and CONFIG_NVME_TARGET_PCI_EPF enabled.
CONFIG_PCI, CONFIG_BLK_DEV_NVME and CONFIG_NVME_TARGET must also be enabled
(obviously).

In addition to this, at least one PCI endpoint controller driver should be
available for the endpoint hardware used.

To facilitate testing, enabling the null-blk driver (CONFIG_BLK_DEV_NULL_BLK)
is also recommended. With this, a simple setup using a null_blk block device
as a subsystem namespace can be used.
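As a convenience, these options can be switched on in an existing kernel
configuration with the kernel's *scripts/config* helper before rebuilding (a
sketch only: the exact option list depends on the platform, and the PCI
endpoint controller driver for the board used must still be selected
separately)::

   # cd /path/to/linux
   # scripts/config -e CONFIG_PCI_ENDPOINT -e CONFIG_PCI_ENDPOINT_CONFIGFS \
        -e CONFIG_NVME_TARGET -e CONFIG_NVME_TARGET_PCI_EPF \
        -e CONFIG_BLK_DEV_NULL_BLK
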
Hardware Requirements
---------------------

To use the NVMe PCI endpoint target driver, at least one endpoint controller
device is required.

To find the list of endpoint controller devices in the system::

   # ls /sys/class/pci_epc/
   a40000000.pcie-ep

If PCI_ENDPOINT_CONFIGFS is enabled::

   # ls /sys/kernel/config/pci_ep/controllers
   a40000000.pcie-ep

The endpoint board must of course also be connected to a host with a PCI cable
with RX-TX signal swapped. If the host PCI slot used does not have
plug-and-play capabilities, the host should be powered off when the NVMe PCI
endpoint device is configured.
NVMe Endpoint Device
--------------------

Creating an NVMe endpoint device is a two-step process. First, an NVMe target
subsystem and port must be defined. Second, the NVMe PCI endpoint device must
be set up and bound to the subsystem and port created.

Creating an NVMe Subsystem and Port
-----------------------------------

Details about how to configure an NVMe target subsystem and port are outside
the scope of this document. The following only provides a simple example of a
port and subsystem with a single namespace backed by a null_blk device.

First, make sure that configfs is enabled::

   # mount -t configfs none /sys/kernel/config

Next, create a null_blk device (default settings give a 250 GB device without
memory backing). The block device created will be /dev/nullb0 by default::

   # modprobe null_blk
   # ls /dev/nullb0
   /dev/nullb0
The NVMe PCI endpoint function target driver must be loaded::

   # modprobe nvmet_pci_epf
   # lsmod | grep nvmet
   nvmet_pci_epf          32768  0
   nvmet                 118784  1 nvmet_pci_epf
   nvme_core             131072  2 nvmet_pci_epf,nvmet

Now, create a subsystem and a port that we will use to create a PCI target
controller when setting up the NVMe PCI endpoint target device. In this
example, the port is created with a maximum of 4 I/O queue pairs::

   # cd /sys/kernel/config/nvmet/subsystems
   # mkdir nvmepf.0.nqn
   # echo -n "Linux-pci-epf" > nvmepf.0.nqn/attr_model
   # echo "0x1b96" > nvmepf.0.nqn/attr_vendor_id
   # echo "0x1b96" > nvmepf.0.nqn/attr_subsys_vendor_id
   # echo 1 > nvmepf.0.nqn/attr_allow_any_host
   # echo 4 > nvmepf.0.nqn/attr_qid_max

Next, create and enable the subsystem namespace using the null_blk block
device::

   # mkdir nvmepf.0.nqn/namespaces/1
   # echo -n "/dev/nullb0" > nvmepf.0.nqn/namespaces/1/device_path
   # echo 1 > "nvmepf.0.nqn/namespaces/1/enable"

Finally, create the target port and link it to the subsystem::

   # cd /sys/kernel/config/nvmet/ports
   # mkdir 1
   # echo -n "pci" > 1/addr_trtype
   # ln -s /sys/kernel/config/nvmet/subsystems/nvmepf.0.nqn \
        /sys/kernel/config/nvmet/ports/1/subsystems/nvmepf.0.nqn
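At this point, the port lists the subsystem it was just linked to, which can be
used as a quick sanity check::

   # ls /sys/kernel/config/nvmet/ports/1/subsystems/
   nvmepf.0.nqn
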
Creating an NVMe PCI Endpoint Device
------------------------------------

With the NVMe target subsystem and port ready for use, the NVMe PCI endpoint
device can now be created and enabled. The NVMe PCI endpoint target driver
should already be loaded (that is done automatically when the port is created)::

   # ls /sys/kernel/config/pci_ep/functions
   nvmet_pci_epf

Next, create function 0::

   # cd /sys/kernel/config/pci_ep/functions/nvmet_pci_epf
   # mkdir nvmepf.0
   # ls nvmepf.0/
   baseclass_code    msix_interrupts   secondary
   cache_line_size   nvme              subclass_code
   deviceid          primary           subsys_id
   interrupt_pin     progif_code       subsys_vendor_id
   msi_interrupts    revid             vendorid

Configure the function using any device ID (the vendor ID for the device will
be automatically set to the same value as the NVMe target subsystem vendor
ID)::

   # cd /sys/kernel/config/pci_ep/functions/nvmet_pci_epf
   # echo 0xBEEF > nvmepf.0/deviceid
   # echo 32 > nvmepf.0/msix_interrupts

If the PCI endpoint controller used does not support MSI-X, MSI can be
configured instead::

   # echo 32 > nvmepf.0/msi_interrupts

Next, let's bind our endpoint device with the target subsystem and port that we
created::

   # echo 1 > nvmepf.0/nvme/portid
   # echo "nvmepf.0.nqn" > nvmepf.0/nvme/subsysnqn

The endpoint function can then be bound to the endpoint controller and the
controller started::

   # cd /sys/kernel/config/pci_ep
   # ln -s functions/nvmet_pci_epf/nvmepf.0 controllers/a40000000.pcie-ep/
   # echo 1 > controllers/a40000000.pcie-ep/start
On the endpoint machine, kernel messages will show information as the NVMe
target device and endpoint device are created and connected.

.. code-block:: text

   null_blk: disk nullb0 created
   null_blk: module loaded
   nvmet: adding nsid 1 to subsystem nvmepf.0.nqn
   nvmet_pci_epf nvmet_pci_epf.0: PCI endpoint controller supports MSI-X, 32 vectors
   nvmet: Created nvm controller 1 for subsystem nvmepf.0.nqn for NQN nqn.2014-08.org.nvmexpress:uuid:2ab90791-2246-4fbb-961d-4c3d5a5a0176.
   nvmet_pci_epf nvmet_pci_epf.0: New PCI ctrl "nvmepf.0.nqn", 4 I/O queues, mdts 524288 B

PCI Root-Complex Host
---------------------

Booting the PCI host will result in the initialization of the PCIe link (this
may be signaled by the PCI endpoint driver with a kernel message). A kernel
message on the endpoint will also signal when the host NVMe driver enables the
device controller::

   nvmet_pci_epf nvmet_pci_epf.0: Enabling controller

On the host side, the NVMe PCI endpoint function target device will be
discoverable as a PCI device, with the vendor ID and device ID as configured::

   # lspci -n
   0000:01:00.0 0108: 1b96:beef

And this device will be recognized as an NVMe device with a single namespace::

   # lsblk
   NAME        MAJ:MIN RM   SIZE RO TYPE MOUNTPOINTS
   nvme0n1     259:0    0   250G  0 disk
The NVMe endpoint block device can then be used as any other regular NVMe
namespace block device. The *nvme* command line utility can be used to get more
detailed information about the endpoint device::

   # nvme id-ctrl /dev/nvme0
   NVME Identify Controller:
   vid       : 0x1b96
   ssvid     : 0x1b96
   sn        : 94993c85650ef7bcd625
   mn        : Linux-pci-epf
   fr        : 6.13.0-r
   rab       : 6
   ieee      : 000000
   cmic      : 0xb
   mdts      : 7
   cntlid    : 0x1
   ver       : 0x20100
   ...


Endpoint Bindings
=================
The NVMe PCI endpoint target driver uses the PCI endpoint configfs device
attributes as follows.

================ ===========================================================
vendorid         Ignored (the vendor id of the NVMe target subsystem is used)
deviceid         Anything is OK (e.g. PCI_ANY_ID)
revid            Do not care
progif_code      Must be 0x02 (NVM Express)
baseclass_code   Must be 0x01 (PCI_BASE_CLASS_STORAGE)
subclass_code    Must be 0x08 (Non-Volatile Memory controller)
cache_line_size  Do not care
subsys_vendor_id Ignored (the subsystem vendor id of the NVMe target
                 subsystem is used)
subsys_id        Anything is OK (e.g. PCI_ANY_ID)
msi_interrupts   At least equal to the number of queue pairs desired
msix_interrupts  At least equal to the number of queue pairs desired
interrupt_pin    Interrupt PIN to use if MSI and MSI-X are not supported
================ ===========================================================
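If any of the "Must be" class code values above are not already populated for
the function, they can be written explicitly before binding the function to the
controller (shown only as an illustration using the function created in the
User Guide; the driver may already set these values for you)::

   # cd /sys/kernel/config/pci_ep/functions/nvmet_pci_epf
   # echo 0x01 > nvmepf.0/baseclass_code
   # echo 0x08 > nvmepf.0/subclass_code
   # echo 0x02 > nvmepf.0/progif_code
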
The NVMe PCI endpoint target function also has some specific configurable
fields defined in the *nvme* subdirectory of the function directory. These
fields are as follows.

================ ===========================================================
mdts_kb          Maximum data transfer size in KiB (default: 512)
portid           The ID of the target port to use
subsysnqn        The NQN of the target subsystem to use
================ ===========================================================
@@ -60,6 +60,7 @@ Storage interfaces

   cdrom/index
   scsi/index
   target/index
   nvme/index

Other subsystems
----------------
@ -865,7 +865,6 @@ static int ubd_add(int n, char **error_out)
|
||||
ubd_dev->tag_set.ops = &ubd_mq_ops;
|
||||
ubd_dev->tag_set.queue_depth = 64;
|
||||
ubd_dev->tag_set.numa_node = NUMA_NO_NODE;
|
||||
ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
ubd_dev->tag_set.driver_data = ubd_dev;
|
||||
ubd_dev->tag_set.nr_hw_queues = 1;
|
||||
|
||||
|
@ -27,8 +27,6 @@ bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
|
||||
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
|
||||
|
||||
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
|
||||
obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
|
||||
obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
|
||||
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
|
||||
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
|
||||
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
|
||||
|
@ -7622,7 +7622,7 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e,
|
||||
#define BFQ_ATTR(name) \
|
||||
__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
|
||||
|
||||
static struct elv_fs_entry bfq_attrs[] = {
|
||||
static const struct elv_fs_entry bfq_attrs[] = {
|
||||
BFQ_ATTR(fifo_expire_sync),
|
||||
BFQ_ATTR(fifo_expire_async),
|
||||
BFQ_ATTR(back_seek_max),
|
||||
|
@ -118,17 +118,18 @@ static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
|
||||
|
||||
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
|
||||
{
|
||||
unsigned short nr_vecs = bip->bip_max_vcnt - 1;
|
||||
struct bio_vec *copy = &bip->bip_vec[1];
|
||||
size_t bytes = bip->bip_iter.bi_size;
|
||||
struct iov_iter iter;
|
||||
unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1;
|
||||
struct bio_vec *orig_bvecs = &bip->bip_vec[1];
|
||||
struct bio_vec *bounce_bvec = &bip->bip_vec[0];
|
||||
size_t bytes = bounce_bvec->bv_len;
|
||||
struct iov_iter orig_iter;
|
||||
int ret;
|
||||
|
||||
iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
|
||||
ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
|
||||
iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes);
|
||||
ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
|
||||
WARN_ON_ONCE(ret != bytes);
|
||||
|
||||
bio_integrity_unpin_bvec(copy, nr_vecs, true);
|
||||
bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -301,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
|
||||
return nr_bvecs;
|
||||
}
|
||||
|
||||
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
|
||||
int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
||||
unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
|
||||
struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
|
||||
struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
|
||||
size_t offset, bytes = iter->count;
|
||||
unsigned int direction, nr_bvecs;
|
||||
struct iov_iter iter;
|
||||
int ret, nr_vecs;
|
||||
size_t offset;
|
||||
bool copy;
|
||||
|
||||
if (bio_integrity(bio))
|
||||
@ -323,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
|
||||
else
|
||||
direction = ITER_SOURCE;
|
||||
|
||||
iov_iter_ubuf(&iter, direction, ubuf, bytes);
|
||||
nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
|
||||
nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1);
|
||||
if (nr_vecs > BIO_MAX_VECS)
|
||||
return -E2BIG;
|
||||
if (nr_vecs > UIO_FASTIOV) {
|
||||
@ -334,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
|
||||
pages = NULL;
|
||||
}
|
||||
|
||||
copy = !iov_iter_is_aligned(&iter, align, align);
|
||||
ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
|
||||
copy = !iov_iter_is_aligned(iter, align, align);
|
||||
ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset);
|
||||
if (unlikely(ret < 0))
|
||||
goto free_bvec;
|
||||
|
||||
@ -365,6 +364,55 @@ free_bvec:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta)
|
||||
{
|
||||
struct bio_integrity_payload *bip = bio_integrity(bio);
|
||||
|
||||
if (meta->flags & IO_INTEGRITY_CHK_GUARD)
|
||||
bip->bip_flags |= BIP_CHECK_GUARD;
|
||||
if (meta->flags & IO_INTEGRITY_CHK_APPTAG)
|
||||
bip->bip_flags |= BIP_CHECK_APPTAG;
|
||||
if (meta->flags & IO_INTEGRITY_CHK_REFTAG)
|
||||
bip->bip_flags |= BIP_CHECK_REFTAG;
|
||||
|
||||
bip->app_tag = meta->app_tag;
|
||||
}
|
||||
|
||||
int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
|
||||
{
|
||||
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
|
||||
unsigned int integrity_bytes;
|
||||
int ret;
|
||||
struct iov_iter it;
|
||||
|
||||
if (!bi)
|
||||
return -EINVAL;
|
||||
/*
|
||||
* original meta iterator can be bigger.
|
||||
* process integrity info corresponding to current data buffer only.
|
||||
*/
|
||||
it = meta->iter;
|
||||
integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio));
|
||||
if (it.count < integrity_bytes)
|
||||
return -EINVAL;
|
||||
|
||||
/* should fit into two bytes */
|
||||
BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16));
|
||||
|
||||
if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS))
|
||||
return -EINVAL;
|
||||
|
||||
it.count = integrity_bytes;
|
||||
ret = bio_integrity_map_user(bio, &it);
|
||||
if (!ret) {
|
||||
bio_uio_meta_to_bip(bio, meta);
|
||||
bip_set_seed(bio_integrity(bio), meta->seed);
|
||||
iov_iter_advance(&meta->iter, integrity_bytes);
|
||||
meta->seed += bio_integrity_intervals(bi, bio_sectors(bio));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_integrity_prep - Prepare bio for integrity I/O
|
||||
* @bio: bio to prepare
|
||||
@ -435,6 +483,11 @@ bool bio_integrity_prep(struct bio *bio)
|
||||
if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
|
||||
bip->bip_flags |= BIP_IP_CHECKSUM;
|
||||
|
||||
/* describe what tags to check in payload */
|
||||
if (bi->csum_type)
|
||||
bip->bip_flags |= BIP_CHECK_GUARD;
|
||||
if (bi->flags & BLK_INTEGRITY_REF_TAG)
|
||||
bip->bip_flags |= BIP_CHECK_REFTAG;
|
||||
if (bio_integrity_add_page(bio, virt_to_page(buf), len,
|
||||
offset_in_page(buf)) < len) {
|
||||
printk(KERN_ERR "could not attach integrity payload\n");
|
||||
@ -559,7 +612,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
|
||||
|
||||
bip->bip_vec = bip_src->bip_vec;
|
||||
bip->bip_iter = bip_src->bip_iter;
|
||||
bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY;
|
||||
bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS;
|
||||
bip->app_tag = bip_src->app_tag;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
block/bio.c (107 lines changed)
@ -946,8 +946,11 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
|
||||
|
||||
/*
|
||||
* Try to merge a page into a segment, while obeying the hardware segment
|
||||
* size limit. This is not for normal read/write bios, but for passthrough
|
||||
* or Zone Append operations that we can't split.
|
||||
* size limit.
|
||||
*
|
||||
* This is kept around for the integrity metadata, which is still tries
|
||||
* to build the initial bio to the hardware limit and doesn't have proper
|
||||
* helpers to split. Hopefully this will go away soon.
|
||||
*/
|
||||
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
|
||||
struct page *page, unsigned len, unsigned offset,
|
||||
@ -964,106 +967,6 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
|
||||
return bvec_try_merge_page(bv, page, len, offset, same_page);
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_add_hw_page - attempt to add a page to a bio with hw constraints
|
||||
* @q: the target queue
|
||||
* @bio: destination bio
|
||||
* @page: page to add
|
||||
* @len: vec entry length
|
||||
* @offset: vec entry offset
|
||||
* @max_sectors: maximum number of sectors that can be added
|
||||
* @same_page: return if the segment has been merged inside the same page
|
||||
*
|
||||
* Add a page to a bio while respecting the hardware max_sectors, max_segment
|
||||
* and gap limitations.
|
||||
*/
|
||||
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
|
||||
struct page *page, unsigned int len, unsigned int offset,
|
||||
unsigned int max_sectors, bool *same_page)
|
||||
{
|
||||
unsigned int max_size = max_sectors << SECTOR_SHIFT;
|
||||
|
||||
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
|
||||
return 0;
|
||||
|
||||
len = min3(len, max_size, queue_max_segment_size(q));
|
||||
if (len > max_size - bio->bi_iter.bi_size)
|
||||
return 0;
|
||||
|
||||
if (bio->bi_vcnt > 0) {
|
||||
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
|
||||
|
||||
if (bvec_try_merge_hw_page(q, bv, page, len, offset,
|
||||
same_page)) {
|
||||
bio->bi_iter.bi_size += len;
|
||||
return len;
|
||||
}
|
||||
|
||||
if (bio->bi_vcnt >=
|
||||
min(bio->bi_max_vecs, queue_max_segments(q)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* If the queue doesn't support SG gaps and adding this segment
|
||||
* would create a gap, disallow it.
|
||||
*/
|
||||
if (bvec_gap_to_prev(&q->limits, bv, offset))
|
||||
return 0;
|
||||
}
|
||||
|
||||
bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
|
||||
bio->bi_vcnt++;
|
||||
bio->bi_iter.bi_size += len;
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_add_hw_folio - attempt to add a folio to a bio with hw constraints
|
||||
* @q: the target queue
|
||||
* @bio: destination bio
|
||||
* @folio: folio to add
|
||||
* @len: vec entry length
|
||||
* @offset: vec entry offset in the folio
|
||||
* @max_sectors: maximum number of sectors that can be added
|
||||
* @same_page: return if the segment has been merged inside the same folio
|
||||
*
|
||||
* Add a folio to a bio while respecting the hardware max_sectors, max_segment
|
||||
* and gap limitations.
|
||||
*/
|
||||
int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
|
||||
struct folio *folio, size_t len, size_t offset,
|
||||
unsigned int max_sectors, bool *same_page)
|
||||
{
|
||||
if (len > UINT_MAX || offset > UINT_MAX)
|
||||
return 0;
|
||||
return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset,
|
||||
max_sectors, same_page);
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_add_pc_page - attempt to add page to passthrough bio
|
||||
* @q: the target queue
|
||||
* @bio: destination bio
|
||||
* @page: page to add
|
||||
* @len: vec entry length
|
||||
* @offset: vec entry offset
|
||||
*
|
||||
* Attempt to add a page to the bio_vec maplist. This can fail for a
|
||||
* number of reasons, such as the bio being full or target block device
|
||||
* limitations. The target block device must allow bio's up to PAGE_SIZE,
|
||||
* so it is always possible to add a single page to an empty bio.
|
||||
*
|
||||
* This should only be used by passthrough bios.
|
||||
*/
|
||||
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
|
||||
struct page *page, unsigned int len, unsigned int offset)
|
||||
{
|
||||
bool same_page = false;
|
||||
return bio_add_hw_page(q, bio, page, len, offset,
|
||||
queue_max_hw_sectors(q), &same_page);
|
||||
}
|
||||
EXPORT_SYMBOL(bio_add_pc_page);
|
||||
|
||||
/**
|
||||
* __bio_add_page - add page(s) to a bio in a new segment
|
||||
* @bio: destination bio
|
||||
|
@ -52,7 +52,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
|
||||
/**
|
||||
* blkg_rwstat_add - add a value to a blkg_rwstat
|
||||
* @rwstat: target blkg_rwstat
|
||||
* @op: REQ_OP and flags
|
||||
* @opf: REQ_OP and flags
|
||||
* @val: value to add
|
||||
*
|
||||
* Add @val to @rwstat. The counters are chosen according to @rw. The
|
||||
@ -83,8 +83,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
|
||||
/**
|
||||
* blkg_rwstat_read - read the current values of a blkg_rwstat
|
||||
* @rwstat: blkg_rwstat to read
|
||||
* @result: where to put the current values
|
||||
*
|
||||
* Read the current snapshot of @rwstat and return it in the aux counts.
|
||||
* Read the current snapshot of @rwstat and return it in the @result counts.
|
||||
*/
|
||||
static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
|
||||
struct blkg_rwstat_sample *result)
|
||||
|
@ -225,7 +225,9 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx);
|
||||
|
||||
/**
|
||||
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
|
||||
* @return: true if this bio needs to be submitted with the root blkg context.
|
||||
* @bio: the target &bio
|
||||
*
|
||||
* Return: true if this bio needs to be submitted with the root blkg context.
|
||||
*
|
||||
* In order to avoid priority inversions we sometimes need to issue a bio as if
|
||||
* it were attached to the root blkg, and then backcharge to the actual owning
|
||||
@ -245,7 +247,7 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio)
|
||||
* @q: request_queue of interest
|
||||
*
|
||||
* Lookup blkg for the @blkcg - @q pair.
|
||||
|
||||
*
|
||||
* Must be called in a RCU critical section.
|
||||
*/
|
||||
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
|
||||
@ -268,7 +270,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
|
||||
}
|
||||
|
||||
/**
|
||||
* blkg_to_pdata - get policy private data
|
||||
* blkg_to_pd - get policy private data
|
||||
* @blkg: blkg of interest
|
||||
* @pol: policy of interest
|
||||
*
|
||||
@ -287,7 +289,7 @@ static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
|
||||
}
|
||||
|
||||
/**
|
||||
* pdata_to_blkg - get blkg associated with policy private data
|
||||
* pd_to_blkg - get blkg associated with policy private data
|
||||
* @pd: policy private data of interest
|
||||
*
|
||||
* @pd is policy private data. Determine the blkg it's associated with.
|
||||
|
@ -629,8 +629,14 @@ static void __submit_bio(struct bio *bio)
|
||||
blk_mq_submit_bio(bio);
|
||||
} else if (likely(bio_queue_enter(bio) == 0)) {
|
||||
struct gendisk *disk = bio->bi_bdev->bd_disk;
|
||||
|
||||
disk->fops->submit_bio(bio);
|
||||
|
||||
if ((bio->bi_opf & REQ_POLLED) &&
|
||||
!(disk->queue->limits.features & BLK_FEAT_POLL)) {
|
||||
bio->bi_status = BLK_STS_NOTSUPP;
|
||||
bio_endio(bio);
|
||||
} else {
|
||||
disk->fops->submit_bio(bio);
|
||||
}
|
||||
blk_queue_exit(disk->queue);
|
||||
}
|
||||
|
||||
@ -805,12 +811,6 @@ void submit_bio_noacct(struct bio *bio)
|
||||
}
|
||||
}
|
||||
|
||||
if (!(q->limits.features & BLK_FEAT_POLL) &&
|
||||
(bio->bi_opf & REQ_POLLED)) {
|
||||
bio_clear_polled(bio);
|
||||
goto not_supported;
|
||||
}
|
||||
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_READ:
|
||||
break;
|
||||
@ -935,7 +935,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
|
||||
return 0;
|
||||
|
||||
q = bdev_get_queue(bdev);
|
||||
if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL))
|
||||
if (cookie == BLK_QC_T_NONE)
|
||||
return 0;
|
||||
|
||||
blk_flush_plug(current->plug, false);
|
||||
@ -956,7 +956,8 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
|
||||
} else {
|
||||
struct gendisk *disk = q->disk;
|
||||
|
||||
if (disk && disk->fops->poll_bio)
|
||||
if ((q->limits.features & BLK_FEAT_POLL) && disk &&
|
||||
disk->fops->poll_bio)
|
||||
ret = disk->fops->poll_bio(bio, iob, flags);
|
||||
}
|
||||
blk_queue_exit(q);
|
||||
|
@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg);
|
||||
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
|
||||
ssize_t bytes)
|
||||
{
|
||||
int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
|
||||
int ret;
|
||||
struct iov_iter iter;
|
||||
unsigned int direction;
|
||||
|
||||
if (op_is_write(req_op(rq)))
|
||||
direction = ITER_DEST;
|
||||
else
|
||||
direction = ITER_SOURCE;
|
||||
iov_iter_ubuf(&iter, direction, ubuf, bytes);
|
||||
ret = bio_integrity_map_user(rq->bio, &iter);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@ -218,9 +226,7 @@ static ssize_t flag_store(struct device *dev, const char *page, size_t count,
|
||||
else
|
||||
lim.integrity.flags |= flag;
|
||||
|
||||
blk_mq_freeze_queue(q);
|
||||
err = queue_limits_commit_update(q, &lim);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
err = queue_limits_commit_update_frozen(q, &lim);
|
||||
if (err)
|
||||
return err;
|
||||
return count;
|
||||
|
block/blk-map.c (128 lines changed)
@ -189,7 +189,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
|
||||
}
|
||||
}
|
||||
|
||||
if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) {
|
||||
if (bio_add_page(bio, page, bytes, offset) < bytes) {
|
||||
if (!map_data)
|
||||
__free_page(page);
|
||||
break;
|
||||
@ -272,86 +272,27 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq,
|
||||
static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
iov_iter_extraction_t extraction_flags = 0;
|
||||
unsigned int max_sectors = queue_max_hw_sectors(rq->q);
|
||||
unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
|
||||
struct bio *bio;
|
||||
int ret;
|
||||
int j;
|
||||
|
||||
if (!iov_iter_count(iter))
|
||||
return -EINVAL;
|
||||
|
||||
bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask);
|
||||
if (bio == NULL)
|
||||
if (!bio)
|
||||
return -ENOMEM;
|
||||
|
||||
if (blk_queue_pci_p2pdma(rq->q))
|
||||
extraction_flags |= ITER_ALLOW_P2PDMA;
|
||||
if (iov_iter_extract_will_pin(iter))
|
||||
bio_set_flag(bio, BIO_PAGE_PINNED);
|
||||
|
||||
while (iov_iter_count(iter)) {
|
||||
struct page *stack_pages[UIO_FASTIOV];
|
||||
struct page **pages = stack_pages;
|
||||
ssize_t bytes;
|
||||
size_t offs;
|
||||
int npages;
|
||||
|
||||
if (nr_vecs > ARRAY_SIZE(stack_pages))
|
||||
pages = NULL;
|
||||
|
||||
bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX,
|
||||
nr_vecs, extraction_flags, &offs);
|
||||
if (unlikely(bytes <= 0)) {
|
||||
ret = bytes ? bytes : -EFAULT;
|
||||
goto out_unmap;
|
||||
}
|
||||
|
||||
npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
|
||||
|
||||
if (unlikely(offs & queue_dma_alignment(rq->q)))
|
||||
j = 0;
|
||||
else {
|
||||
for (j = 0; j < npages; j++) {
|
||||
struct page *page = pages[j];
|
||||
unsigned int n = PAGE_SIZE - offs;
|
||||
bool same_page = false;
|
||||
|
||||
if (n > bytes)
|
||||
n = bytes;
|
||||
|
||||
if (!bio_add_hw_page(rq->q, bio, page, n, offs,
|
||||
max_sectors, &same_page))
|
||||
break;
|
||||
|
||||
if (same_page)
|
||||
bio_release_page(bio, page);
|
||||
bytes -= n;
|
||||
offs = 0;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* release the pages we didn't map into the bio, if any
|
||||
*/
|
||||
while (j < npages)
|
||||
bio_release_page(bio, pages[j++]);
|
||||
if (pages != stack_pages)
|
||||
kvfree(pages);
|
||||
/* couldn't stuff something into bio? */
|
||||
if (bytes) {
|
||||
iov_iter_revert(iter, bytes);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ret = bio_iov_iter_get_pages(bio, iter);
|
||||
if (ret)
|
||||
goto out_put;
|
||||
ret = blk_rq_append_bio(rq, bio);
|
||||
if (ret)
|
||||
goto out_unmap;
|
||||
goto out_release;
|
||||
return 0;
|
||||
|
||||
out_unmap:
|
||||
out_release:
|
||||
bio_release_pages(bio, false);
|
||||
out_put:
|
||||
blk_mq_map_bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
@ -422,8 +363,7 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data,
|
||||
page = virt_to_page(data);
|
||||
else
|
||||
page = vmalloc_to_page(data);
|
||||
if (bio_add_pc_page(q, bio, page, bytes,
|
||||
offset) < bytes) {
|
||||
if (bio_add_page(bio, page, bytes, offset) < bytes) {
|
||||
/* we don't support partial mappings */
|
||||
bio_uninit(bio);
|
||||
kfree(bio);
|
||||
@ -507,7 +447,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
|
||||
if (!reading)
|
||||
memcpy(page_address(page), p, bytes);
|
||||
|
||||
if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
|
||||
if (bio_add_page(bio, page, bytes, 0) < bytes)
|
||||
break;
|
||||
|
||||
len -= bytes;
|
||||
@ -536,24 +476,33 @@ cleanup:
|
||||
*/
|
||||
int blk_rq_append_bio(struct request *rq, struct bio *bio)
|
||||
{
|
||||
struct bvec_iter iter;
|
||||
struct bio_vec bv;
|
||||
const struct queue_limits *lim = &rq->q->limits;
|
||||
unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT;
|
||||
unsigned int nr_segs = 0;
|
||||
int ret;
|
||||
|
||||
bio_for_each_bvec(bv, bio, iter)
|
||||
nr_segs++;
|
||||
/* check that the data layout matches the hardware restrictions */
|
||||
ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes);
|
||||
if (ret) {
|
||||
/* if we would have to split the bio, copy instead */
|
||||
if (ret > 0)
|
||||
ret = -EREMOTEIO;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!rq->bio) {
|
||||
blk_rq_bio_prep(rq, bio, nr_segs);
|
||||
} else {
|
||||
if (rq->bio) {
|
||||
if (!ll_back_merge_fn(rq, bio, nr_segs))
|
||||
return -EINVAL;
|
||||
rq->biotail->bi_next = bio;
|
||||
rq->biotail = bio;
|
||||
rq->__data_len += (bio)->bi_iter.bi_size;
|
||||
rq->__data_len += bio->bi_iter.bi_size;
|
||||
bio_crypt_free_ctx(bio);
|
||||
return 0;
|
||||
}
|
||||
|
||||
rq->nr_phys_segments = nr_segs;
|
||||
rq->bio = rq->biotail = bio;
|
||||
rq->__data_len = bio->bi_iter.bi_size;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_rq_append_bio);
|
||||
@ -561,9 +510,7 @@ EXPORT_SYMBOL(blk_rq_append_bio);
|
||||
/* Prepare bio for passthrough IO given ITER_BVEC iter */
|
||||
static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
|
||||
{
|
||||
const struct queue_limits *lim = &rq->q->limits;
|
||||
unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT;
|
||||
unsigned int nsegs;
|
||||
unsigned int max_bytes = rq->q->limits.max_hw_sectors << SECTOR_SHIFT;
|
||||
struct bio *bio;
|
||||
int ret;
|
||||
|
||||
@ -576,18 +523,10 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
|
||||
return -ENOMEM;
|
||||
bio_iov_bvec_set(bio, iter);
|
||||
|
||||
/* check that the data layout matches the hardware restrictions */
|
||||
ret = bio_split_rw_at(bio, lim, &nsegs, max_bytes);
|
||||
if (ret) {
|
||||
/* if we would have to split the bio, copy instead */
|
||||
if (ret > 0)
|
||||
ret = -EREMOTEIO;
|
||||
ret = blk_rq_append_bio(rq, bio);
|
||||
if (ret)
|
||||
blk_mq_map_bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
blk_rq_bio_prep(rq, bio, nsegs);
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -644,8 +583,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
|
||||
ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask);
|
||||
else
|
||||
ret = bio_map_user_iov(rq, &i, gfp_mask);
|
||||
if (ret)
|
||||
if (ret) {
|
||||
if (ret == -EREMOTEIO)
|
||||
ret = -EINVAL;
|
||||
goto unmap_rq;
|
||||
}
|
||||
if (!bio)
|
||||
bio = rq->bio;
|
||||
} while (iov_iter_count(&i));
|
||||
|
@ -473,6 +473,63 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
|
||||
return nr_phys_segs;
|
||||
}
|
||||
|
||||
struct phys_vec {
|
||||
phys_addr_t paddr;
|
||||
u32 len;
|
||||
};
|
||||
|
||||
static bool blk_map_iter_next(struct request *req,
|
||||
struct req_iterator *iter, struct phys_vec *vec)
|
||||
{
|
||||
unsigned int max_size;
|
||||
struct bio_vec bv;
|
||||
|
||||
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
|
||||
if (!iter->bio)
|
||||
return false;
|
||||
vec->paddr = bvec_phys(&req->special_vec);
|
||||
vec->len = req->special_vec.bv_len;
|
||||
iter->bio = NULL;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!iter->iter.bi_size)
|
||||
return false;
|
||||
|
||||
bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
|
||||
vec->paddr = bvec_phys(&bv);
|
||||
max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
|
||||
bv.bv_len = min(bv.bv_len, max_size);
|
||||
bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
|
||||
|
||||
/*
|
||||
* If we are entirely done with this bi_io_vec entry, check if the next
|
||||
* one could be merged into it. This typically happens when moving to
|
||||
* the next bio, but some callers also don't pack bvecs tight.
|
||||
*/
|
||||
while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
|
||||
struct bio_vec next;
|
||||
|
||||
if (!iter->iter.bi_size) {
|
||||
if (!iter->bio->bi_next)
|
||||
break;
|
||||
iter->bio = iter->bio->bi_next;
|
||||
iter->iter = iter->bio->bi_iter;
|
||||
}
|
||||
|
||||
next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
|
||||
if (bv.bv_len + next.bv_len > max_size ||
|
||||
!biovec_phys_mergeable(req->q, &bv, &next))
|
||||
break;
|
||||
|
||||
bv.bv_len += next.bv_len;
|
||||
bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
|
||||
}
|
||||
|
||||
vec->len = bv.bv_len;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
|
||||
struct scatterlist *sglist)
|
||||
{
|
||||
@ -490,120 +547,26 @@ static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
|
||||
return sg_next(*sg);
|
||||
}
|
||||
|
||||
static unsigned blk_bvec_map_sg(struct request_queue *q,
|
||||
struct bio_vec *bvec, struct scatterlist *sglist,
|
||||
struct scatterlist **sg)
|
||||
{
|
||||
unsigned nbytes = bvec->bv_len;
|
||||
unsigned nsegs = 0, total = 0;
|
||||
|
||||
while (nbytes > 0) {
|
||||
unsigned offset = bvec->bv_offset + total;
|
||||
unsigned len = get_max_segment_size(&q->limits,
|
||||
bvec_phys(bvec) + total, nbytes);
|
||||
struct page *page = bvec->bv_page;
|
||||
|
||||
/*
|
||||
* Unfortunately a fair number of drivers barf on scatterlists
|
||||
* that have an offset larger than PAGE_SIZE, despite other
|
||||
* subsystems dealing with that invariant just fine. For now
|
||||
* stick to the legacy format where we never present those from
|
||||
* the block layer, but the code below should be removed once
|
||||
* these offenders (mostly MMC/SD drivers) are fixed.
|
||||
*/
|
||||
page += (offset >> PAGE_SHIFT);
|
||||
offset &= ~PAGE_MASK;
|
||||
|
||||
*sg = blk_next_sg(sg, sglist);
|
||||
sg_set_page(*sg, page, len, offset);
|
||||
|
||||
total += len;
|
||||
nbytes -= len;
|
||||
nsegs++;
|
||||
}
|
||||
|
||||
return nsegs;
|
||||
}
|
||||
|
||||
static inline int __blk_bvec_map_sg(struct bio_vec bv,
|
||||
struct scatterlist *sglist, struct scatterlist **sg)
|
||||
{
|
||||
*sg = blk_next_sg(sg, sglist);
|
||||
sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* only try to merge bvecs into one sg if they are from two bios */
|
||||
static inline bool
|
||||
__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec,
|
||||
struct bio_vec *bvprv, struct scatterlist **sg)
|
||||
{
|
||||
|
||||
int nbytes = bvec->bv_len;
|
||||
|
||||
if (!*sg)
|
||||
return false;
|
||||
|
||||
if ((*sg)->length + nbytes > queue_max_segment_size(q))
|
||||
return false;
|
||||
|
||||
if (!biovec_phys_mergeable(q, bvprv, bvec))
|
||||
return false;
|
||||
|
||||
(*sg)->length += nbytes;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
|
||||
struct scatterlist *sglist,
|
||||
struct scatterlist **sg)
|
||||
{
|
||||
struct bio_vec bvec, bvprv = { NULL };
|
||||
struct bvec_iter iter;
|
||||
int nsegs = 0;
|
||||
bool new_bio = false;
|
||||
|
||||
for_each_bio(bio) {
|
||||
bio_for_each_bvec(bvec, bio, iter) {
|
||||
/*
|
||||
* Only try to merge bvecs from two bios given we
|
||||
* have done bio internal merge when adding pages
|
||||
* to bio
|
||||
*/
|
||||
if (new_bio &&
|
||||
__blk_segment_map_sg_merge(q, &bvec, &bvprv, sg))
|
||||
goto next_bvec;
|
||||
|
||||
if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE)
|
||||
nsegs += __blk_bvec_map_sg(bvec, sglist, sg);
|
||||
else
|
||||
nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg);
|
||||
next_bvec:
|
||||
new_bio = false;
|
||||
}
|
||||
if (likely(bio->bi_iter.bi_size)) {
|
||||
bvprv = bvec;
|
||||
new_bio = true;
|
||||
}
|
||||
}
|
||||
|
||||
return nsegs;
|
||||
}
|
||||
|
||||
/*
|
||||
* map a request to scatterlist, return number of sg entries setup. Caller
|
||||
* must make sure sg can hold rq->nr_phys_segments entries
|
||||
* Map a request to scatterlist, return number of sg entries setup. Caller
|
||||
* must make sure sg can hold rq->nr_phys_segments entries.
|
||||
*/
|
||||
int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
|
||||
struct scatterlist *sglist, struct scatterlist **last_sg)
|
||||
{
|
||||
struct req_iterator iter = {
|
||||
.bio = rq->bio,
|
||||
.iter = rq->bio->bi_iter,
|
||||
};
|
||||
struct phys_vec vec;
|
||||
int nsegs = 0;
|
||||
|
||||
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
|
||||
nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
|
||||
else if (rq->bio)
|
||||
nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
|
||||
while (blk_map_iter_next(rq, &iter, &vec)) {
|
||||
*last_sg = blk_next_sg(last_sg, sglist);
|
||||
sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
|
||||
offset_in_page(vec.paddr));
|
||||
nsegs++;
|
||||
}
|
||||
|
||||
if (*last_sg)
|
||||
sg_mark_end(*last_sg);
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <linux/smp.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/group_cpus.h>
|
||||
#include <linux/device/bus.h>
|
||||
|
||||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
@ -54,3 +55,39 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
|
||||
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_mq_map_hw_queues - Create CPU to hardware queue mapping
|
||||
* @qmap: CPU to hardware queue map
|
||||
* @dev: The device to map queues
|
||||
* @offset: Queue offset to use for the device
|
||||
*
|
||||
* Create a CPU to hardware queue mapping in @qmap. The struct bus_type
|
||||
* irq_get_affinity callback will be used to retrieve the affinity.
|
||||
*/
|
||||
void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
|
||||
struct device *dev, unsigned int offset)
|
||||
|
||||
{
|
||||
const struct cpumask *mask;
|
||||
unsigned int queue, cpu;
|
||||
|
||||
if (!dev->bus->irq_get_affinity)
|
||||
goto fallback;
|
||||
|
||||
for (queue = 0; queue < qmap->nr_queues; queue++) {
|
||||
mask = dev->bus->irq_get_affinity(dev, queue + offset);
|
||||
if (!mask)
|
||||
goto fallback;
|
||||
|
||||
for_each_cpu(cpu, mask)
|
||||
qmap->mq_map[cpu] = qmap->queue_offset + queue;
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
fallback:
|
||||
WARN_ON_ONCE(qmap->nr_queues > 1);
|
||||
blk_mq_clear_mq_map(qmap);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues);
|
||||
|
@ -172,21 +172,13 @@ static int hctx_state_show(void *data, struct seq_file *m)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name
|
||||
static const char *const alloc_policy_name[] = {
|
||||
BLK_TAG_ALLOC_NAME(FIFO),
|
||||
BLK_TAG_ALLOC_NAME(RR),
|
||||
};
|
||||
#undef BLK_TAG_ALLOC_NAME
|
||||
|
||||
#define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name
|
||||
static const char *const hctx_flag_name[] = {
|
||||
HCTX_FLAG_NAME(SHOULD_MERGE),
|
||||
HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
|
||||
HCTX_FLAG_NAME(STACKING),
|
||||
HCTX_FLAG_NAME(TAG_HCTX_SHARED),
|
||||
HCTX_FLAG_NAME(BLOCKING),
|
||||
HCTX_FLAG_NAME(NO_SCHED),
|
||||
HCTX_FLAG_NAME(TAG_RR),
|
||||
HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
|
||||
};
|
||||
#undef HCTX_FLAG_NAME
|
||||
@ -194,22 +186,11 @@ static const char *const hctx_flag_name[] = {
|
||||
static int hctx_flags_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
|
||||
|
||||
BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) !=
|
||||
BLK_MQ_F_ALLOC_POLICY_START_BIT);
|
||||
BUILD_BUG_ON(ARRAY_SIZE(alloc_policy_name) != BLK_TAG_ALLOC_MAX);
|
||||
BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != ilog2(BLK_MQ_F_MAX));
|
||||
|
||||
seq_puts(m, "alloc_policy=");
|
||||
if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
|
||||
alloc_policy_name[alloc_policy])
|
||||
seq_puts(m, alloc_policy_name[alloc_policy]);
|
||||
else
|
||||
seq_printf(m, "%d", alloc_policy);
|
||||
seq_puts(m, " ");
|
||||
blk_flags_show(m,
|
||||
hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy),
|
||||
hctx_flag_name, ARRAY_SIZE(hctx_flag_name));
|
||||
blk_flags_show(m, hctx->flags, hctx_flag_name,
|
||||
ARRAY_SIZE(hctx_flag_name));
|
||||
seq_puts(m, "\n");
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,46 +0,0 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (c) 2016 Christoph Hellwig.
|
||||
*/
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-mq-pci.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include "blk-mq.h"
|
||||
|
||||
/**
|
||||
* blk_mq_pci_map_queues - provide a default queue mapping for PCI device
|
||||
* @qmap: CPU to hardware queue map.
|
||||
* @pdev: PCI device associated with @set.
|
||||
* @offset: Offset to use for the pci irq vector
|
||||
*
|
||||
* This function assumes the PCI device @pdev has at least as many available
|
||||
* interrupt vectors as @set has queues. It will then query the vector
|
||||
* corresponding to each queue for it's affinity mask and built queue mapping
|
||||
* that maps a queue to the CPUs that have irq affinity for the corresponding
|
||||
* vector.
|
||||
*/
|
||||
void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
|
||||
int offset)
|
||||
{
|
||||
const struct cpumask *mask;
|
||||
unsigned int queue, cpu;
|
||||
|
||||
for (queue = 0; queue < qmap->nr_queues; queue++) {
|
||||
mask = pci_irq_get_affinity(pdev, queue + offset);
|
||||
if (!mask)
|
||||
goto fallback;
|
||||
|
||||
for_each_cpu(cpu, mask)
|
||||
qmap->mq_map[cpu] = qmap->queue_offset + queue;
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
fallback:
|
||||
WARN_ON_ONCE(qmap->nr_queues > 1);
|
||||
blk_mq_clear_mq_map(qmap);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
|
@ -351,8 +351,7 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
ctx = blk_mq_get_ctx(q);
|
||||
hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
|
||||
type = hctx->type;
|
||||
if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
|
||||
list_empty_careful(&ctx->rq_lists[type]))
|
||||
if (list_empty_careful(&ctx->rq_lists[type]))
|
||||
goto out_put;
|
||||
|
||||
/* default per sw-queue merge */
|
||||
|
@ -544,30 +544,11 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
|
||||
node);
|
||||
}
|
||||
|
||||
int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
|
||||
struct sbitmap_queue *breserved_tags,
|
||||
unsigned int queue_depth, unsigned int reserved,
|
||||
int node, int alloc_policy)
|
||||
{
|
||||
unsigned int depth = queue_depth - reserved;
|
||||
bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
|
||||
|
||||
if (bt_alloc(bitmap_tags, depth, round_robin, node))
|
||||
return -ENOMEM;
|
||||
if (bt_alloc(breserved_tags, reserved, round_robin, node))
|
||||
goto free_bitmap_tags;
|
||||
|
||||
return 0;
|
||||
|
||||
free_bitmap_tags:
|
||||
sbitmap_queue_free(bitmap_tags);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
|
||||
unsigned int reserved_tags,
|
||||
int node, int alloc_policy)
|
||||
unsigned int reserved_tags, unsigned int flags, int node)
|
||||
{
|
||||
unsigned int depth = total_tags - reserved_tags;
|
||||
bool round_robin = flags & BLK_MQ_F_TAG_RR;
|
||||
struct blk_mq_tags *tags;
|
||||
|
||||
if (total_tags > BLK_MQ_TAG_MAX) {
|
||||
@ -582,14 +563,18 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
|
||||
tags->nr_tags = total_tags;
|
||||
tags->nr_reserved_tags = reserved_tags;
|
||||
spin_lock_init(&tags->lock);
|
||||
if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
|
||||
goto out_free_tags;
|
||||
if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node))
|
||||
goto out_free_bitmap_tags;
|
||||
|
||||
if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
|
||||
total_tags, reserved_tags, node,
|
||||
alloc_policy) < 0) {
|
||||
kfree(tags);
|
||||
return NULL;
|
||||
}
|
||||
return tags;
|
||||
|
||||
out_free_bitmap_tags:
|
||||
sbitmap_queue_free(&tags->bitmap_tags);
|
||||
out_free_tags:
|
||||
kfree(tags);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void blk_mq_free_tags(struct blk_mq_tags *tags)
|
||||
|
@ -1,46 +0,0 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (c) 2016 Christoph Hellwig.
|
||||
*/
|
||||
#include <linux/device.h>
|
||||
#include <linux/blk-mq-virtio.h>
|
||||
#include <linux/virtio_config.h>
|
||||
#include <linux/module.h>
|
||||
#include "blk-mq.h"
|
||||
|
||||
/**
|
||||
* blk_mq_virtio_map_queues - provide a default queue mapping for virtio device
|
||||
* @qmap: CPU to hardware queue map.
|
||||
* @vdev: virtio device to provide a mapping for.
|
||||
* @first_vec: first interrupt vectors to use for queues (usually 0)
|
||||
*
|
||||
* This function assumes the virtio device @vdev has at least as many available
|
||||
* interrupt vectors as @set has queues. It will then query the vector
|
||||
* corresponding to each queue for it's affinity mask and built queue mapping
|
||||
* that maps a queue to the CPUs that have irq affinity for the corresponding
|
||||
* vector.
|
||||
*/
|
||||
void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
|
||||
struct virtio_device *vdev, int first_vec)
|
||||
{
|
||||
const struct cpumask *mask;
|
||||
unsigned int queue, cpu;
|
||||
|
||||
if (!vdev->config->get_vq_affinity)
|
||||
goto fallback;
|
||||
|
||||
for (queue = 0; queue < qmap->nr_queues; queue++) {
|
||||
mask = vdev->config->get_vq_affinity(vdev, first_vec + queue);
|
||||
if (!mask)
|
||||
goto fallback;
|
||||
|
||||
for_each_cpu(cpu, mask)
|
||||
qmap->mq_map[cpu] = qmap->queue_offset + queue;
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
fallback:
|
||||
blk_mq_map_queues(qmap);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
|
@ -131,6 +131,10 @@ static bool blk_freeze_set_owner(struct request_queue *q,
|
||||
if (!q->mq_freeze_depth) {
|
||||
q->mq_freeze_owner = owner;
|
||||
q->mq_freeze_owner_depth = 1;
|
||||
q->mq_freeze_disk_dead = !q->disk ||
|
||||
test_bit(GD_DEAD, &q->disk->state) ||
|
||||
!blk_queue_registered(q);
|
||||
q->mq_freeze_queue_dying = blk_queue_dying(q);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -142,8 +146,6 @@ static bool blk_freeze_set_owner(struct request_queue *q,
|
||||
/* verify the last unfreeze in owner context */
|
||||
static bool blk_unfreeze_check_owner(struct request_queue *q)
|
||||
{
|
||||
if (!q->mq_freeze_owner)
|
||||
return false;
|
||||
if (q->mq_freeze_owner != current)
|
||||
return false;
|
||||
if (--q->mq_freeze_owner_depth == 0) {
|
||||
@ -189,7 +191,7 @@ bool __blk_freeze_queue_start(struct request_queue *q,
|
||||
void blk_freeze_queue_start(struct request_queue *q)
|
||||
{
|
||||
if (__blk_freeze_queue_start(q, current))
|
||||
blk_freeze_acquire_lock(q, false, false);
|
||||
blk_freeze_acquire_lock(q);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
|
||||
|
||||
@ -237,7 +239,7 @@ bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
|
||||
void blk_mq_unfreeze_queue(struct request_queue *q)
|
||||
{
|
||||
if (__blk_mq_unfreeze_queue(q, false))
|
||||
blk_unfreeze_release_lock(q, false, false);
|
||||
blk_unfreeze_release_lock(q);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
|
||||
|
||||
@ -2656,8 +2658,10 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
|
||||
if (bio->bi_opf & REQ_RAHEAD)
|
||||
rq->cmd_flags |= REQ_FAILFAST_MASK;
|
||||
|
||||
rq->bio = rq->biotail = bio;
|
||||
rq->__sector = bio->bi_iter.bi_sector;
|
||||
blk_rq_bio_prep(rq, bio, nr_segs);
|
||||
rq->__data_len = bio->bi_iter.bi_size;
|
||||
rq->nr_phys_segments = nr_segs;
|
||||
if (bio_integrity(bio))
|
||||
rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
|
||||
bio);
|
||||
@ -3092,14 +3096,21 @@ void blk_mq_submit_bio(struct bio *bio)
|
||||
}
|
||||
|
||||
/*
|
||||
* Device reconfiguration may change logical block size, so alignment
|
||||
* check has to be done with queue usage counter held
|
||||
* Device reconfiguration may change logical block size or reduce the
|
||||
* number of poll queues, so the checks for alignment and poll support
|
||||
* have to be done with queue usage counter held.
|
||||
*/
|
||||
if (unlikely(bio_unaligned(bio, q))) {
|
||||
bio_io_error(bio);
|
||||
goto queue_exit;
|
||||
}
|
||||
|
||||
if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
|
||||
bio->bi_status = BLK_STS_NOTSUPP;
|
||||
bio_endio(bio);
|
||||
goto queue_exit;
|
||||
}
|
||||
|
||||
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
|
||||
if (!bio)
|
||||
goto queue_exit;
|
||||
@ -3472,8 +3483,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
|
||||
if (node == NUMA_NO_NODE)
|
||||
node = set->numa_node;
|
||||
|
||||
tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
|
||||
BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
|
||||
tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node);
|
||||
if (!tags)
|
||||
return NULL;
|
||||
|
||||
@ -4317,12 +4327,6 @@ void blk_mq_release(struct request_queue *q)
|
||||
blk_mq_sysfs_deinit(q);
|
||||
}
|
||||
|
||||
static bool blk_mq_can_poll(struct blk_mq_tag_set *set)
|
||||
{
|
||||
return set->nr_maps > HCTX_TYPE_POLL &&
|
||||
set->map[HCTX_TYPE_POLL].nr_queues;
|
||||
}
|
||||
|
||||
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
|
||||
struct queue_limits *lim, void *queuedata)
|
||||
{
|
||||
@ -4333,7 +4337,7 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
|
||||
if (!lim)
|
||||
lim = &default_lim;
|
||||
lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
|
||||
if (blk_mq_can_poll(set))
|
||||
if (set->nr_maps > HCTX_TYPE_POLL)
|
||||
lim->features |= BLK_FEAT_POLL;
|
||||
|
||||
q = blk_alloc_queue(lim, set->numa_node);
|
||||
@ -5021,8 +5025,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
|
||||
fallback:
|
||||
blk_mq_update_queue_map(set);
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
struct queue_limits lim;
|
||||
|
||||
blk_mq_realloc_hw_ctxs(set, q);
|
||||
|
||||
if (q->nr_hw_queues != set->nr_hw_queues) {
|
||||
@ -5036,13 +5038,6 @@ fallback:
|
||||
set->nr_hw_queues = prev_nr_hw_queues;
|
||||
goto fallback;
|
||||
}
|
||||
lim = queue_limits_start_update(q);
|
||||
if (blk_mq_can_poll(set))
|
||||
lim.features |= BLK_FEAT_POLL;
|
||||
else
|
||||
lim.features &= ~BLK_FEAT_POLL;
|
||||
if (queue_limits_commit_update(q, &lim) < 0)
|
||||
pr_warn("updating the poll flag failed\n");
|
||||
blk_mq_map_swqueue(q);
|
||||
}
|
||||
|
||||
@ -5102,9 +5097,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
|
||||
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
|
||||
struct io_comp_batch *iob, unsigned int flags)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie);
|
||||
|
||||
return blk_hctx_poll(q, hctx, iob, flags);
|
||||
if (!blk_mq_can_poll(q))
|
||||
return 0;
|
||||
return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags);
|
||||
}
|
||||
|
||||
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
|
||||
|
@ -163,11 +163,8 @@ struct blk_mq_alloc_data {
|
||||
};
|
||||
|
||||
struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
|
||||
unsigned int reserved_tags, int node, int alloc_policy);
|
||||
unsigned int reserved_tags, unsigned int flags, int node);
|
||||
void blk_mq_free_tags(struct blk_mq_tags *tags);
|
||||
int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
|
||||
struct sbitmap_queue *breserved_tags, unsigned int queue_depth,
|
||||
unsigned int reserved, int node, int alloc_policy);
|
||||
|
||||
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
|
||||
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
|
||||
@ -451,4 +448,10 @@ do { \
|
||||
#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
|
||||
__blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
|
||||
|
||||
static inline bool blk_mq_can_poll(struct request_queue *q)
|
||||
{
|
||||
return (q->limits.features & BLK_FEAT_POLL) &&
|
||||
q->tag_set->map[HCTX_TYPE_POLL].nr_queues;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -413,7 +413,8 @@ int blk_set_default_limits(struct queue_limits *lim)
|
||||
* @lim: limits to apply
|
||||
*
|
||||
* Apply the limits in @lim that were obtained from queue_limits_start_update()
|
||||
* and updated by the caller to @q.
|
||||
* and updated by the caller to @q. The caller must have frozen the queue or
|
||||
* ensure that there are no outstanding I/Os by other means.
|
||||
*
|
||||
* Returns 0 if successful, else a negative error code.
|
||||
*/
|
||||
@ -443,6 +444,30 @@ out_unlock:
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(queue_limits_commit_update);
|
||||
|
||||
/**
 * queue_limits_commit_update_frozen - commit an atomic update of queue limits
 * @q: queue to update
 * @lim: limits to apply
 *
 * Apply the limits in @lim that were obtained from queue_limits_start_update()
 * and updated with the new values by the caller to @q. Freezes the queue
 * before the update and unfreezes it after.
 *
 * Returns 0 if successful, else a negative error code.
 */
int queue_limits_commit_update_frozen(struct request_queue *q,
		struct queue_limits *lim)
{
	int ret;

	blk_mq_freeze_queue(q);
	ret = queue_limits_commit_update(q, lim);
	blk_mq_unfreeze_queue(q);

	return ret;
}
EXPORT_SYMBOL_GPL(queue_limits_commit_update_frozen);
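For reference, a minimal sketch of how a caller can use the new helper; my_set_block_size() and its bsize parameter are illustrative only, while queue_limits_start_update() and queue_limits_commit_update_frozen() are the interfaces added above:

static int my_set_block_size(struct request_queue *q, unsigned int bsize)
{
	struct queue_limits lim;

	/* Snapshot the current limits, adjust them, then commit with the
	 * queue frozen so no in-flight I/O sees a half-updated state. */
	lim = queue_limits_start_update(q);
	lim.logical_block_size = bsize;
	lim.physical_block_size = bsize;
	return queue_limits_commit_update_frozen(q, &lim);
}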
|
||||
|
||||
/**
|
||||
* queue_limits_set - apply queue limits to queue
|
||||
* @q: queue to update
|
||||
|
@ -24,6 +24,8 @@ struct queue_sysfs_entry {
|
||||
struct attribute attr;
|
||||
ssize_t (*show)(struct gendisk *disk, char *page);
|
||||
ssize_t (*store)(struct gendisk *disk, const char *page, size_t count);
|
||||
int (*store_limit)(struct gendisk *disk, const char *page,
|
||||
size_t count, struct queue_limits *lim);
|
||||
void (*load_module)(struct gendisk *disk, const char *page, size_t count);
|
||||
};
|
||||
|
||||
@ -153,13 +155,11 @@ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0)
|
||||
QUEUE_SYSFS_SHOW_CONST(write_same_max, 0)
|
||||
QUEUE_SYSFS_SHOW_CONST(poll_delay, -1)
|
||||
|
||||
static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
|
||||
const char *page, size_t count)
|
||||
static int queue_max_discard_sectors_store(struct gendisk *disk,
|
||||
const char *page, size_t count, struct queue_limits *lim)
|
||||
{
|
||||
unsigned long max_discard_bytes;
|
||||
struct queue_limits lim;
|
||||
ssize_t ret;
|
||||
int err;
|
||||
|
||||
ret = queue_var_store(&max_discard_bytes, page, count);
|
||||
if (ret < 0)
|
||||
@ -171,38 +171,28 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
|
||||
if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
|
||||
return -EINVAL;
|
||||
|
||||
lim = queue_limits_start_update(disk->queue);
|
||||
lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
|
||||
err = queue_limits_commit_update(disk->queue, &lim);
|
||||
if (err)
|
||||
return err;
|
||||
return ret;
|
||||
lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count)
|
||||
static int
|
||||
queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
|
||||
struct queue_limits *lim)
|
||||
{
|
||||
unsigned long max_sectors_kb;
|
||||
struct queue_limits lim;
|
||||
ssize_t ret;
|
||||
int err;
|
||||
|
||||
ret = queue_var_store(&max_sectors_kb, page, count);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
lim = queue_limits_start_update(disk->queue);
|
||||
lim.max_user_sectors = max_sectors_kb << 1;
|
||||
err = queue_limits_commit_update(disk->queue, &lim);
|
||||
if (err)
|
||||
return err;
|
||||
return ret;
|
||||
lim->max_user_sectors = max_sectors_kb << 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
|
||||
size_t count, blk_features_t feature)
|
||||
size_t count, struct queue_limits *lim, blk_features_t feature)
|
||||
{
|
||||
struct queue_limits lim;
|
||||
unsigned long val;
|
||||
ssize_t ret;
|
||||
|
||||
@ -210,15 +200,11 @@ static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
lim = queue_limits_start_update(disk->queue);
|
||||
if (val)
|
||||
lim.features |= feature;
|
||||
lim->features |= feature;
|
||||
else
|
||||
lim.features &= ~feature;
|
||||
ret = queue_limits_commit_update(disk->queue, &lim);
|
||||
if (ret)
|
||||
return ret;
|
||||
return count;
|
||||
lim->features &= ~feature;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define QUEUE_SYSFS_FEATURE(_name, _feature) \
|
||||
@ -227,10 +213,10 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
|
||||
return sysfs_emit(page, "%u\n", \
|
||||
!!(disk->queue->limits.features & _feature)); \
|
||||
} \
|
||||
static ssize_t queue_##_name##_store(struct gendisk *disk, \
|
||||
const char *page, size_t count) \
|
||||
static int queue_##_name##_store(struct gendisk *disk, \
|
||||
const char *page, size_t count, struct queue_limits *lim) \
|
||||
{ \
|
||||
return queue_feature_store(disk, page, count, _feature); \
|
||||
return queue_feature_store(disk, page, count, lim, _feature); \
|
||||
}
|
||||
|
||||
QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
|
||||
@ -245,10 +231,17 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
|
||||
!!(disk->queue->limits.features & _feature)); \
|
||||
}
|
||||
|
||||
QUEUE_SYSFS_FEATURE_SHOW(poll, BLK_FEAT_POLL);
|
||||
QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA);
|
||||
QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX);
|
||||
|
||||
static ssize_t queue_poll_show(struct gendisk *disk, char *page)
|
||||
{
|
||||
if (queue_is_mq(disk->queue))
|
||||
return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue));
|
||||
return sysfs_emit(page, "%u\n",
|
||||
!!(disk->queue->limits.features & BLK_FEAT_POLL));
|
||||
}
|
||||
|
||||
static ssize_t queue_zoned_show(struct gendisk *disk, char *page)
|
||||
{
|
||||
if (blk_queue_is_zoned(disk->queue))
|
||||
@ -266,10 +259,9 @@ static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
|
||||
return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page);
|
||||
}
|
||||
|
||||
static ssize_t queue_iostats_passthrough_store(struct gendisk *disk,
|
||||
const char *page, size_t count)
|
||||
static int queue_iostats_passthrough_store(struct gendisk *disk,
|
||||
const char *page, size_t count, struct queue_limits *lim)
|
||||
{
|
||||
struct queue_limits lim;
|
||||
unsigned long ios;
|
||||
ssize_t ret;
|
||||
|
||||
@ -277,18 +269,13 @@ static ssize_t queue_iostats_passthrough_store(struct gendisk *disk,
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
lim = queue_limits_start_update(disk->queue);
|
||||
if (ios)
|
||||
lim.flags |= BLK_FLAG_IOSTATS_PASSTHROUGH;
|
||||
lim->flags |= BLK_FLAG_IOSTATS_PASSTHROUGH;
|
||||
else
|
||||
lim.flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH;
|
||||
|
||||
ret = queue_limits_commit_update(disk->queue, &lim);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return count;
|
||||
lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t queue_nomerges_show(struct gendisk *disk, char *page)
|
||||
{
|
||||
return queue_var_show((blk_queue_nomerges(disk->queue) << 1) |
|
||||
@ -391,12 +378,10 @@ static ssize_t queue_wc_show(struct gendisk *disk, char *page)
|
||||
return sysfs_emit(page, "write through\n");
|
||||
}
|
||||
|
||||
static ssize_t queue_wc_store(struct gendisk *disk, const char *page,
|
||||
size_t count)
|
||||
static int queue_wc_store(struct gendisk *disk, const char *page,
|
||||
size_t count, struct queue_limits *lim)
|
||||
{
|
||||
struct queue_limits lim;
|
||||
bool disable;
|
||||
int err;
|
||||
|
||||
if (!strncmp(page, "write back", 10)) {
|
||||
disable = false;
|
||||
@ -407,15 +392,11 @@ static ssize_t queue_wc_store(struct gendisk *disk, const char *page,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
lim = queue_limits_start_update(disk->queue);
|
||||
if (disable)
|
||||
lim.flags |= BLK_FLAG_WRITE_CACHE_DISABLED;
|
||||
lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED;
|
||||
else
|
||||
lim.flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED;
|
||||
err = queue_limits_commit_update(disk->queue, &lim);
|
||||
if (err)
|
||||
return err;
|
||||
return count;
|
||||
lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define QUEUE_RO_ENTRY(_prefix, _name) \
|
||||
@ -431,6 +412,13 @@ static struct queue_sysfs_entry _prefix##_entry = { \
|
||||
.store = _prefix##_store, \
|
||||
};
|
||||
|
||||
#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
|
||||
static struct queue_sysfs_entry _prefix##_entry = { \
|
||||
.attr = { .name = _name, .mode = 0644 }, \
|
||||
.show = _prefix##_show, \
|
||||
.store_limit = _prefix##_store, \
|
||||
}
|
||||
|
||||
#define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \
|
||||
static struct queue_sysfs_entry _prefix##_entry = { \
|
||||
.attr = { .name = _name, .mode = 0644 }, \
|
||||
@ -441,7 +429,7 @@ static struct queue_sysfs_entry _prefix##_entry = { \
|
||||
|
||||
QUEUE_RW_ENTRY(queue_requests, "nr_requests");
|
||||
QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
|
||||
QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
|
||||
QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
|
||||
QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
|
||||
QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
|
||||
QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
|
||||
@ -457,7 +445,7 @@ QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
|
||||
QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
|
||||
QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
|
||||
QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
|
||||
QUEUE_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
|
||||
QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
|
||||
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
|
||||
|
||||
QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
|
||||
@ -477,11 +465,11 @@ QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
|
||||
QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
|
||||
|
||||
QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
|
||||
QUEUE_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
|
||||
QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
|
||||
QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
|
||||
QUEUE_RW_ENTRY(queue_poll, "io_poll");
|
||||
QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
|
||||
QUEUE_RW_ENTRY(queue_wc, "write_cache");
|
||||
QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache");
|
||||
QUEUE_RO_ENTRY(queue_fua, "fua");
|
||||
QUEUE_RO_ENTRY(queue_dax, "dax");
|
||||
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
|
||||
@ -494,10 +482,10 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = {
|
||||
.show = queue_logical_block_size_show,
|
||||
};
|
||||
|
||||
QUEUE_RW_ENTRY(queue_rotational, "rotational");
|
||||
QUEUE_RW_ENTRY(queue_iostats, "iostats");
|
||||
QUEUE_RW_ENTRY(queue_add_random, "add_random");
|
||||
QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
|
||||
QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational");
|
||||
QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats");
|
||||
QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random");
|
||||
QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes");
|
||||
|
||||
#ifdef CONFIG_BLK_WBT
|
||||
static ssize_t queue_var_store64(s64 *var, const char *page)
|
||||
@ -693,9 +681,10 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
|
||||
struct queue_sysfs_entry *entry = to_queue(attr);
|
||||
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
|
||||
struct request_queue *q = disk->queue;
|
||||
unsigned int noio_flag;
|
||||
ssize_t res;
|
||||
|
||||
if (!entry->store)
|
||||
if (!entry->store_limit && !entry->store)
|
||||
return -EIO;
|
||||
|
||||
/*
|
||||
@ -706,11 +695,28 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
|
||||
if (entry->load_module)
|
||||
entry->load_module(disk, page, length);
|
||||
|
||||
blk_mq_freeze_queue(q);
|
||||
if (entry->store_limit) {
|
||||
struct queue_limits lim = queue_limits_start_update(q);
|
||||
|
||||
res = entry->store_limit(disk, page, length, &lim);
|
||||
if (res < 0) {
|
||||
queue_limits_cancel_update(q);
|
||||
return res;
|
||||
}
|
||||
|
||||
res = queue_limits_commit_update_frozen(q, &lim);
|
||||
if (res)
|
||||
return res;
|
||||
return length;
|
||||
}
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
blk_mq_freeze_queue(q);
|
||||
noio_flag = memalloc_noio_save();
|
||||
res = entry->store(disk, page, length);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
memalloc_noio_restore(noio_flag);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
return res;
|
||||
}
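A sketch of the resulting convention for limit-backed attributes (the "dummy" attribute and its use of BLK_FEAT_ROTATIONAL are purely illustrative): a ->store_limit() handler only parses the input and modifies the passed-in limits, and queue_attr_store() above takes care of freezing the queue and committing the update.

static ssize_t queue_dummy_show(struct gendisk *disk, char *page)
{
	return sysfs_emit(page, "%u\n",
			!!(disk->queue->limits.features & BLK_FEAT_ROTATIONAL));
}

static int queue_dummy_store(struct gendisk *disk, const char *page,
		size_t count, struct queue_limits *lim)
{
	unsigned long val;
	ssize_t ret;

	ret = queue_var_store(&val, page, count);
	if (ret < 0)
		return ret;
	/* Only touch *lim; the caller commits it with the queue frozen. */
	if (val)
		lim->features |= BLK_FEAT_ROTATIONAL;
	else
		lim->features &= ~BLK_FEAT_ROTATIONAL;
	return 0;
}

QUEUE_LIM_RW_ENTRY(queue_dummy, "dummy");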
|
||||
|
||||
|
@ -11,12 +11,8 @@
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/refcount.h>
|
||||
#include <linux/mempool.h>
|
||||
@ -463,6 +459,8 @@ static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
|
||||
static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
|
||||
struct blk_zone_wplug *zwplug)
|
||||
{
|
||||
lockdep_assert_held(&zwplug->lock);
|
||||
|
||||
/* If the zone write plug was already removed, we are done. */
|
||||
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
|
||||
return false;
|
||||
@ -584,6 +582,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
|
||||
bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
|
||||
bio_io_error(bio);
|
||||
disk_put_zone_wplug(zwplug);
|
||||
/* Drop the reference taken by disk_zone_wplug_add_bio(). */
|
||||
blk_queue_exit(q);
|
||||
}
|
||||
|
||||
@ -895,10 +894,7 @@ void blk_zone_write_plug_init_request(struct request *req)
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop the extra reference on the queue usage we got when
|
||||
* plugging the BIO and advance the write pointer offset.
|
||||
*/
|
||||
/* Drop the reference taken by disk_zone_wplug_add_bio(). */
|
||||
blk_queue_exit(q);
|
||||
zwplug->wp_offset += bio_sectors(bio);
|
||||
|
||||
@ -917,6 +913,8 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
|
||||
{
|
||||
struct gendisk *disk = bio->bi_bdev->bd_disk;
|
||||
|
||||
lockdep_assert_held(&zwplug->lock);
|
||||
|
||||
/*
|
||||
* If we lost track of the zone write pointer due to a write error,
|
||||
* the user must either execute a report zones, reset the zone or finish
|
||||
@ -1446,7 +1444,6 @@ static int disk_update_zone_resources(struct gendisk *disk,
|
||||
unsigned int nr_seq_zones, nr_conv_zones;
|
||||
unsigned int pool_size;
|
||||
struct queue_limits lim;
|
||||
int ret;
|
||||
|
||||
disk->nr_zones = args->nr_zones;
|
||||
disk->zone_capacity = args->zone_capacity;
|
||||
@ -1497,11 +1494,7 @@ static int disk_update_zone_resources(struct gendisk *disk,
|
||||
}
|
||||
|
||||
commit:
|
||||
blk_mq_freeze_queue(q);
|
||||
ret = queue_limits_commit_update(q, &lim);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
|
||||
return ret;
|
||||
return queue_limits_commit_update_frozen(q, &lim);
|
||||
}
|
||||
|
||||
static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
|
||||
@ -1776,37 +1769,41 @@ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
|
||||
EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
|
||||
|
||||
#ifdef CONFIG_BLK_DEBUG_FS
|
||||
static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
|
||||
struct seq_file *m)
|
||||
{
|
||||
unsigned int zwp_wp_offset, zwp_flags;
|
||||
unsigned int zwp_zone_no, zwp_ref;
|
||||
unsigned int zwp_bio_list_size;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&zwplug->lock, flags);
|
||||
zwp_zone_no = zwplug->zone_no;
|
||||
zwp_flags = zwplug->flags;
|
||||
zwp_ref = refcount_read(&zwplug->ref);
|
||||
zwp_wp_offset = zwplug->wp_offset;
|
||||
zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
|
||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
||||
|
||||
seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
|
||||
zwp_wp_offset, zwp_bio_list_size);
|
||||
}
|
||||
|
||||
int queue_zone_wplugs_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct request_queue *q = data;
|
||||
struct gendisk *disk = q->disk;
|
||||
struct blk_zone_wplug *zwplug;
|
||||
unsigned int zwp_wp_offset, zwp_flags;
|
||||
unsigned int zwp_zone_no, zwp_ref;
|
||||
unsigned int zwp_bio_list_size, i;
|
||||
unsigned long flags;
|
||||
unsigned int i;
|
||||
|
||||
if (!disk->zone_wplugs_hash)
|
||||
return 0;
|
||||
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
|
||||
hlist_for_each_entry_rcu(zwplug,
|
||||
&disk->zone_wplugs_hash[i], node) {
|
||||
spin_lock_irqsave(&zwplug->lock, flags);
|
||||
zwp_zone_no = zwplug->zone_no;
|
||||
zwp_flags = zwplug->flags;
|
||||
zwp_ref = refcount_read(&zwplug->ref);
|
||||
zwp_wp_offset = zwplug->wp_offset;
|
||||
zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
|
||||
spin_unlock_irqrestore(&zwplug->lock, flags);
|
||||
|
||||
seq_printf(m, "%u 0x%x %u %u %u\n",
|
||||
zwp_zone_no, zwp_flags, zwp_ref,
|
||||
zwp_wp_offset, zwp_bio_list_size);
|
||||
}
|
||||
}
|
||||
for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
|
||||
hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
|
||||
node)
|
||||
queue_zone_wplug_show(zwplug, m);
|
||||
rcu_read_unlock();
|
||||
|
||||
return 0;
|
||||
|
block/blk.h | 31
@ -556,14 +556,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
|
||||
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
|
||||
struct lock_class_key *lkclass);
|
||||
|
||||
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
|
||||
struct page *page, unsigned int len, unsigned int offset,
|
||||
unsigned int max_sectors, bool *same_page);
|
||||
|
||||
int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
|
||||
struct folio *folio, size_t len, size_t offset,
|
||||
unsigned int max_sectors, bool *same_page);
|
||||
|
||||
/*
|
||||
* Clean up a page appropriately, where the page may be pinned, may have a
|
||||
* ref taken on it or neither.
|
||||
@ -720,22 +712,29 @@ void blk_integrity_verify(struct bio *bio);
|
||||
void blk_integrity_prepare(struct request *rq);
|
||||
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
|
||||
|
||||
static inline void blk_freeze_acquire_lock(struct request_queue *q, bool
|
||||
disk_dead, bool queue_dying)
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
static inline void blk_freeze_acquire_lock(struct request_queue *q)
|
||||
{
|
||||
if (!disk_dead)
|
||||
if (!q->mq_freeze_disk_dead)
|
||||
rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
|
||||
if (!queue_dying)
|
||||
if (!q->mq_freeze_queue_dying)
|
||||
rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_);
|
||||
}
|
||||
|
||||
static inline void blk_unfreeze_release_lock(struct request_queue *q, bool
|
||||
disk_dead, bool queue_dying)
|
||||
static inline void blk_unfreeze_release_lock(struct request_queue *q)
|
||||
{
|
||||
if (!queue_dying)
|
||||
if (!q->mq_freeze_queue_dying)
|
||||
rwsem_release(&q->q_lockdep_map, _RET_IP_);
|
||||
if (!disk_dead)
|
||||
if (!q->mq_freeze_disk_dead)
|
||||
rwsem_release(&q->io_lockdep_map, _RET_IP_);
|
||||
}
|
||||
#else
|
||||
static inline void blk_freeze_acquire_lock(struct request_queue *q)
|
||||
{
|
||||
}
|
||||
static inline void blk_unfreeze_release_lock(struct request_queue *q)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* BLK_INTERNAL_H */
|
||||
|
@ -381,7 +381,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
|
||||
set->queue_depth = 128;
|
||||
set->numa_node = NUMA_NO_NODE;
|
||||
set->cmd_size = sizeof(struct bsg_job) + dd_job_size;
|
||||
set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING;
|
||||
set->flags = BLK_MQ_F_BLOCKING;
|
||||
if (blk_mq_alloc_tag_set(set))
|
||||
goto out_tag_set;
|
||||
|
||||
|
@ -405,12 +405,12 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
|
||||
#define to_elv(atr) container_of_const((atr), struct elv_fs_entry, attr)
|
||||
|
||||
static ssize_t
|
||||
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
|
||||
{
|
||||
struct elv_fs_entry *entry = to_elv(attr);
|
||||
const struct elv_fs_entry *entry = to_elv(attr);
|
||||
struct elevator_queue *e;
|
||||
ssize_t error;
|
||||
|
||||
@ -428,7 +428,7 @@ static ssize_t
|
||||
elv_attr_store(struct kobject *kobj, struct attribute *attr,
|
||||
const char *page, size_t length)
|
||||
{
|
||||
struct elv_fs_entry *entry = to_elv(attr);
|
||||
const struct elv_fs_entry *entry = to_elv(attr);
|
||||
struct elevator_queue *e;
|
||||
ssize_t error;
|
||||
|
||||
@ -461,7 +461,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
|
||||
|
||||
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
|
||||
if (!error) {
|
||||
struct elv_fs_entry *attr = e->type->elevator_attrs;
|
||||
const struct elv_fs_entry *attr = e->type->elevator_attrs;
|
||||
if (attr) {
|
||||
while (attr->attr.name) {
|
||||
if (sysfs_create_file(&e->kobj, &attr->attr))
|
||||
@ -547,14 +547,6 @@ void elv_unregister(struct elevator_type *e)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(elv_unregister);
|
||||
|
||||
static inline bool elv_support_iosched(struct request_queue *q)
|
||||
{
|
||||
if (!queue_is_mq(q) ||
|
||||
(q->tag_set->flags & BLK_MQ_F_NO_SCHED))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* For single queue devices, default to using mq-deadline. If we have multiple
|
||||
* queues or mq-deadline is not available, default to "none".
|
||||
@ -580,9 +572,6 @@ void elevator_init_mq(struct request_queue *q)
|
||||
struct elevator_type *e;
|
||||
int err;
|
||||
|
||||
if (!elv_support_iosched(q))
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(blk_queue_registered(q));
|
||||
|
||||
if (unlikely(q->elevator))
|
||||
@ -601,16 +590,13 @@ void elevator_init_mq(struct request_queue *q)
|
||||
*
|
||||
* Disk isn't added yet, so verify the queue lock only manually.
|
||||
*/
|
||||
blk_freeze_queue_start_non_owner(q);
|
||||
blk_freeze_acquire_lock(q, true, false);
|
||||
blk_mq_freeze_queue_wait(q);
|
||||
blk_mq_freeze_queue(q);
|
||||
|
||||
blk_mq_cancel_work_sync(q);
|
||||
|
||||
err = blk_mq_init_sched(q, e);
|
||||
|
||||
blk_unfreeze_release_lock(q, true, false);
|
||||
blk_mq_unfreeze_queue_non_owner(q);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
|
||||
if (err) {
|
||||
pr_warn("\"%s\" elevator initialization failed, "
|
||||
@ -717,9 +703,6 @@ void elv_iosched_load_module(struct gendisk *disk, const char *buf,
|
||||
struct elevator_type *found;
|
||||
const char *name;
|
||||
|
||||
if (!elv_support_iosched(disk->queue))
|
||||
return;
|
||||
|
||||
strscpy(elevator_name, buf, sizeof(elevator_name));
|
||||
name = strstrip(elevator_name);
|
||||
|
||||
@ -737,9 +720,6 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
|
||||
char elevator_name[ELV_NAME_MAX];
|
||||
int ret;
|
||||
|
||||
if (!elv_support_iosched(disk->queue))
|
||||
return count;
|
||||
|
||||
strscpy(elevator_name, buf, sizeof(elevator_name));
|
||||
ret = elevator_change(disk->queue, strstrip(elevator_name));
|
||||
if (!ret)
|
||||
@ -754,9 +734,6 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
|
||||
struct elevator_type *cur = NULL, *e;
|
||||
int len = 0;
|
||||
|
||||
if (!elv_support_iosched(q))
|
||||
return sprintf(name, "none\n");
|
||||
|
||||
if (!q->elevator) {
|
||||
len += sprintf(name+len, "[none] ");
|
||||
} else {
|
||||
|
@ -71,7 +71,7 @@ struct elevator_type
|
||||
|
||||
size_t icq_size; /* see iocontext.h */
|
||||
size_t icq_align; /* ditto */
|
||||
struct elv_fs_entry *elevator_attrs;
|
||||
const struct elv_fs_entry *elevator_attrs;
|
||||
const char *elevator_name;
|
||||
const char *elevator_alias;
|
||||
struct module *elevator_owner;
|
||||
|
block/fops.c | 45
@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
|
||||
struct bio bio;
|
||||
ssize_t ret;
|
||||
|
||||
WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
|
||||
if (nr_pages <= DIO_INLINE_BIO_VECS)
|
||||
vecs = inline_vecs;
|
||||
else {
|
||||
@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio)
|
||||
{
|
||||
struct blkdev_dio *dio = bio->bi_private;
|
||||
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
|
||||
bool is_sync = dio->flags & DIO_IS_SYNC;
|
||||
|
||||
if (bio->bi_status && !dio->bio.bi_status)
|
||||
dio->bio.bi_status = bio->bi_status;
|
||||
|
||||
if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
|
||||
bio_integrity_unmap_user(bio);
|
||||
|
||||
if (atomic_dec_and_test(&dio->ref)) {
|
||||
if (!(dio->flags & DIO_IS_SYNC)) {
|
||||
if (!is_sync) {
|
||||
struct kiocb *iocb = dio->iocb;
|
||||
ssize_t ret;
|
||||
|
||||
@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
* a retry of this from blocking context.
|
||||
*/
|
||||
if (unlikely(iov_iter_count(iter))) {
|
||||
bio_release_pages(bio, false);
|
||||
bio_clear_flag(bio, BIO_REFFED);
|
||||
bio_put(bio);
|
||||
blk_finish_plug(&plug);
|
||||
return -EAGAIN;
|
||||
ret = -EAGAIN;
|
||||
goto fail;
|
||||
}
|
||||
bio->bi_opf |= REQ_NOWAIT;
|
||||
}
|
||||
if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
|
||||
ret = bio_integrity_map_iter(bio, iocb->private);
|
||||
if (unlikely(ret))
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (is_read) {
|
||||
if (dio->flags & DIO_SHOULD_DIRTY)
|
||||
@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
|
||||
bio_put(&dio->bio);
|
||||
return ret;
|
||||
fail:
|
||||
bio_release_pages(bio, false);
|
||||
bio_clear_flag(bio, BIO_REFFED);
|
||||
bio_put(bio);
|
||||
blk_finish_plug(&plug);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void blkdev_bio_end_io_async(struct bio *bio)
|
||||
@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
|
||||
ret = blk_status_to_errno(bio->bi_status);
|
||||
}
|
||||
|
||||
if (iocb->ki_flags & IOCB_HAS_METADATA)
|
||||
bio_integrity_unmap_user(bio);
|
||||
|
||||
iocb->ki_complete(iocb, ret);
|
||||
|
||||
if (dio->flags & DIO_SHOULD_DIRTY) {
|
||||
@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
|
||||
bio_iov_bvec_set(bio, iter);
|
||||
} else {
|
||||
ret = bio_iov_iter_get_pages(bio, iter);
|
||||
if (unlikely(ret)) {
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
if (unlikely(ret))
|
||||
goto out_bio_put;
|
||||
}
|
||||
dio->size = bio->bi_iter.bi_size;
|
||||
|
||||
@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
|
||||
task_io_account_write(bio->bi_iter.bi_size);
|
||||
}
|
||||
|
||||
if (iocb->ki_flags & IOCB_HAS_METADATA) {
|
||||
ret = bio_integrity_map_iter(bio, iocb->private);
|
||||
WRITE_ONCE(iocb->private, NULL);
|
||||
if (unlikely(ret))
|
||||
goto out_bio_put;
|
||||
}
|
||||
|
||||
if (iocb->ki_flags & IOCB_ATOMIC)
|
||||
bio->bi_opf |= REQ_ATOMIC;
|
||||
|
||||
@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
|
||||
submit_bio(bio);
|
||||
}
|
||||
return -EIOCBQUEUED;
|
||||
|
||||
out_bio_put:
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
||||
|
@ -400,21 +400,23 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
|
||||
struct device *ddev = disk_to_dev(disk);
|
||||
int ret;
|
||||
|
||||
/* Only makes sense for bio-based to set ->poll_bio */
|
||||
if (queue_is_mq(disk->queue) && disk->fops->poll_bio)
|
||||
return -EINVAL;
|
||||
if (queue_is_mq(disk->queue)) {
|
||||
/*
|
||||
* ->submit_bio and ->poll_bio are bypassed for blk-mq drivers.
|
||||
*/
|
||||
if (disk->fops->submit_bio || disk->fops->poll_bio)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* The disk queue should now be all set with enough information about
|
||||
* the device for the elevator code to pick an adequate default
|
||||
* elevator if one is needed, that is, for devices requesting queue
|
||||
* registration.
|
||||
*/
|
||||
elevator_init_mq(disk->queue);
|
||||
|
||||
/* Mark bdev as having a submit_bio, if needed */
|
||||
if (disk->fops->submit_bio)
|
||||
/*
|
||||
* Initialize the I/O scheduler code and pick a default one if
|
||||
* needed.
|
||||
*/
|
||||
elevator_init_mq(disk->queue);
|
||||
} else {
|
||||
if (!disk->fops->submit_bio)
|
||||
return -EINVAL;
|
||||
bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO);
|
||||
}
|
||||
|
||||
/*
|
||||
* If the driver provides an explicit major number it also must provide
|
||||
@ -661,7 +663,7 @@ void del_gendisk(struct gendisk *disk)
|
||||
struct request_queue *q = disk->queue;
|
||||
struct block_device *part;
|
||||
unsigned long idx;
|
||||
bool start_drain, queue_dying;
|
||||
bool start_drain;
|
||||
|
||||
might_sleep();
|
||||
|
||||
@ -690,9 +692,8 @@ void del_gendisk(struct gendisk *disk)
|
||||
*/
|
||||
mutex_lock(&disk->open_mutex);
|
||||
start_drain = __blk_mark_disk_dead(disk);
|
||||
queue_dying = blk_queue_dying(q);
|
||||
if (start_drain)
|
||||
blk_freeze_acquire_lock(q, true, queue_dying);
|
||||
blk_freeze_acquire_lock(q);
|
||||
xa_for_each_start(&disk->part_tbl, idx, part, 1)
|
||||
drop_partition(part);
|
||||
mutex_unlock(&disk->open_mutex);
|
||||
@ -748,7 +749,7 @@ void del_gendisk(struct gendisk *disk)
|
||||
blk_mq_exit_queue(q);
|
||||
|
||||
if (start_drain)
|
||||
blk_unfreeze_release_lock(q, true, queue_dying);
|
||||
blk_unfreeze_release_lock(q);
|
||||
}
|
||||
EXPORT_SYMBOL(del_gendisk);
|
||||
|
||||
@ -798,7 +799,7 @@ static ssize_t disk_badblocks_store(struct device *dev,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
|
||||
void blk_request_module(dev_t devt)
|
||||
static bool blk_probe_dev(dev_t devt)
|
||||
{
|
||||
unsigned int major = MAJOR(devt);
|
||||
struct blk_major_name **n;
|
||||
@ -808,14 +809,26 @@ void blk_request_module(dev_t devt)
|
||||
if ((*n)->major == major && (*n)->probe) {
|
||||
(*n)->probe(devt);
|
||||
mutex_unlock(&major_names_lock);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
mutex_unlock(&major_names_lock);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
|
||||
/* Make old-style 2.4 aliases work */
|
||||
request_module("block-major-%d", MAJOR(devt));
|
||||
void blk_request_module(dev_t devt)
|
||||
{
|
||||
int error;
|
||||
|
||||
if (blk_probe_dev(devt))
|
||||
return;
|
||||
|
||||
error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt));
|
||||
/* Make old-style 2.4 aliases work */
|
||||
if (error > 0)
|
||||
error = request_module("block-major-%d", MAJOR(devt));
|
||||
if (!error)
|
||||
blk_probe_dev(devt);
|
||||
}
|
||||
#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
|
||||
|
||||
|
@ -889,7 +889,7 @@ KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
|
||||
#undef KYBER_LAT_SHOW_STORE
|
||||
|
||||
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
|
||||
static struct elv_fs_entry kyber_sched_attrs[] = {
|
||||
static const struct elv_fs_entry kyber_sched_attrs[] = {
|
||||
KYBER_LAT_ATTR(read),
|
||||
KYBER_LAT_ATTR(write),
|
||||
__ATTR_NULL
|
||||
|
@ -834,7 +834,7 @@ STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
|
||||
#define DD_ATTR(name) \
|
||||
__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
|
||||
|
||||
static struct elv_fs_entry deadline_attrs[] = {
|
||||
static const struct elv_fs_entry deadline_attrs[] = {
|
||||
DD_ATTR(read_expire),
|
||||
DD_ATTR(write_expire),
|
||||
DD_ATTR(writes_starved),
|
||||
|
@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/**
|
||||
/*
|
||||
* ldm - Part of the Linux-NTFS project.
|
||||
*
|
||||
* Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
|
||||
|
@ -396,7 +396,7 @@ extern const struct attribute_group *ahci_sdev_groups[];
|
||||
.shost_groups = ahci_shost_groups, \
|
||||
.sdev_groups = ahci_sdev_groups, \
|
||||
.change_queue_depth = ata_scsi_change_queue_depth, \
|
||||
.tag_alloc_policy = BLK_TAG_ALLOC_RR, \
|
||||
.tag_alloc_policy_rr = true, \
|
||||
.device_configure = ata_scsi_device_configure
|
||||
|
||||
extern struct ata_port_operations ahci_ops;
|
||||
|
@ -935,7 +935,7 @@ static const struct scsi_host_template pata_macio_sht = {
|
||||
.device_configure = pata_macio_device_configure,
|
||||
.sdev_groups = ata_common_sdev_groups,
|
||||
.can_queue = ATA_DEF_QUEUE,
|
||||
.tag_alloc_policy = BLK_TAG_ALLOC_RR,
|
||||
.tag_alloc_policy_rr = true,
|
||||
};
|
||||
|
||||
static struct ata_port_operations pata_macio_ops = {
|
||||
|
@ -672,7 +672,7 @@ static const struct scsi_host_template mv6_sht = {
|
||||
.dma_boundary = MV_DMA_BOUNDARY,
|
||||
.sdev_groups = ata_ncq_sdev_groups,
|
||||
.change_queue_depth = ata_scsi_change_queue_depth,
|
||||
.tag_alloc_policy = BLK_TAG_ALLOC_RR,
|
||||
.tag_alloc_policy_rr = true,
|
||||
.device_configure = ata_scsi_device_configure
|
||||
};
|
||||
|
||||
|
@ -385,7 +385,7 @@ static const struct scsi_host_template nv_adma_sht = {
|
||||
.device_configure = nv_adma_device_configure,
|
||||
.sdev_groups = ata_ncq_sdev_groups,
|
||||
.change_queue_depth = ata_scsi_change_queue_depth,
|
||||
.tag_alloc_policy = BLK_TAG_ALLOC_RR,
|
||||
.tag_alloc_policy_rr = true,
|
||||
};
|
||||
|
||||
static const struct scsi_host_template nv_swncq_sht = {
|
||||
@ -396,7 +396,7 @@ static const struct scsi_host_template nv_swncq_sht = {
|
||||
.device_configure = nv_swncq_device_configure,
|
||||
.sdev_groups = ata_ncq_sdev_groups,
|
||||
.change_queue_depth = ata_scsi_change_queue_depth,
|
||||
.tag_alloc_policy = BLK_TAG_ALLOC_RR,
|
||||
.tag_alloc_policy_rr = true,
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -378,7 +378,6 @@ static const struct scsi_host_template sil24_sht = {
|
||||
.can_queue = SIL24_MAX_CMDS,
|
||||
.sg_tablesize = SIL24_MAX_SGE,
|
||||
.dma_boundary = ATA_DMA_BOUNDARY,
|
||||
.tag_alloc_policy = BLK_TAG_ALLOC_FIFO,
|
||||
.sdev_groups = ata_ncq_sdev_groups,
|
||||
.change_queue_depth = ata_scsi_change_queue_depth,
|
||||
.device_configure = ata_scsi_device_configure
|
||||
|
@ -1819,7 +1819,6 @@ static int fd_alloc_drive(int drive)
|
||||
unit[drive].tag_set.nr_maps = 1;
|
||||
unit[drive].tag_set.queue_depth = 2;
|
||||
unit[drive].tag_set.numa_node = NUMA_NO_NODE;
|
||||
unit[drive].tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
if (blk_mq_alloc_tag_set(&unit[drive].tag_set))
|
||||
goto out_cleanup_trackbuf;
|
||||
|
||||
|
@ -368,7 +368,6 @@ aoeblk_gdalloc(void *vp)
|
||||
set->nr_hw_queues = 1;
|
||||
set->queue_depth = 128;
|
||||
set->numa_node = NUMA_NO_NODE;
|
||||
set->flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
err = blk_mq_alloc_tag_set(set);
|
||||
if (err) {
|
||||
pr_err("aoe: cannot allocate tag set for %ld.%d\n",
|
||||
|
@ -2088,7 +2088,6 @@ static int __init atari_floppy_init (void)
|
||||
unit[i].tag_set.nr_maps = 1;
|
||||
unit[i].tag_set.queue_depth = 2;
|
||||
unit[i].tag_set.numa_node = NUMA_NO_NODE;
|
||||
unit[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
ret = blk_mq_alloc_tag_set(&unit[i].tag_set);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
@ -4596,7 +4596,6 @@ static int __init do_floppy_init(void)
|
||||
tag_sets[drive].nr_maps = 1;
|
||||
tag_sets[drive].queue_depth = 2;
|
||||
tag_sets[drive].numa_node = NUMA_NO_NODE;
|
||||
tag_sets[drive].flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
err = blk_mq_alloc_tag_set(&tag_sets[drive]);
|
||||
if (err)
|
||||
goto out_put_disk;
|
||||
|
@ -68,7 +68,6 @@ struct loop_device {
|
||||
struct list_head idle_worker_list;
|
||||
struct rb_root worker_tree;
|
||||
struct timer_list timer;
|
||||
bool use_dio;
|
||||
bool sysfs_inited;
|
||||
|
||||
struct request_queue *lo_queue;
|
||||
@ -182,41 +181,44 @@ static bool lo_bdev_can_use_dio(struct loop_device *lo,
|
||||
return true;
|
||||
}
|
||||
|
||||
static void __loop_update_dio(struct loop_device *lo, bool dio)
|
||||
static bool lo_can_use_dio(struct loop_device *lo)
|
||||
{
|
||||
struct file *file = lo->lo_backing_file;
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
struct block_device *backing_bdev = NULL;
|
||||
bool use_dio;
|
||||
struct inode *inode = lo->lo_backing_file->f_mapping->host;
|
||||
|
||||
if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT))
|
||||
return false;
|
||||
|
||||
if (S_ISBLK(inode->i_mode))
|
||||
backing_bdev = I_BDEV(inode);
|
||||
else if (inode->i_sb->s_bdev)
|
||||
backing_bdev = inode->i_sb->s_bdev;
|
||||
return lo_bdev_can_use_dio(lo, I_BDEV(inode));
|
||||
if (inode->i_sb->s_bdev)
|
||||
return lo_bdev_can_use_dio(lo, inode->i_sb->s_bdev);
|
||||
return true;
|
||||
}
|
||||
|
||||
use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) &&
|
||||
(!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev));
|
||||
/*
 * Direct I/O can be enabled either by using an O_DIRECT file descriptor, or by
 * passing in the LO_FLAGS_DIRECT_IO flag from userspace. It will be silently
 * disabled when the device block size is too small or the offset is unaligned.
 *
 * loop_get_status will always report the effective LO_FLAGS_DIRECT_IO flag and
 * not the originally passed in one.
 */
|
||||
static inline void loop_update_dio(struct loop_device *lo)
|
||||
{
|
||||
bool dio_in_use = lo->lo_flags & LO_FLAGS_DIRECT_IO;
|
||||
|
||||
if (lo->use_dio == use_dio)
|
||||
return;
|
||||
lockdep_assert_held(&lo->lo_mutex);
|
||||
WARN_ON_ONCE(lo->lo_state == Lo_bound &&
|
||||
lo->lo_queue->mq_freeze_depth == 0);
|
||||
|
||||
/* flush dirty pages before changing direct IO */
|
||||
vfs_fsync(file, 0);
|
||||
|
||||
/*
|
||||
* The flag of LO_FLAGS_DIRECT_IO is handled similarly with
|
||||
* LO_FLAGS_READ_ONLY, both are set from kernel, and losetup
|
||||
* will get updated by ioctl(LOOP_GET_STATUS)
|
||||
*/
|
||||
if (lo->lo_state == Lo_bound)
|
||||
blk_mq_freeze_queue(lo->lo_queue);
|
||||
lo->use_dio = use_dio;
|
||||
if (use_dio)
|
||||
if (lo->lo_backing_file->f_flags & O_DIRECT)
|
||||
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
|
||||
else
|
||||
if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo))
|
||||
lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
|
||||
if (lo->lo_state == Lo_bound)
|
||||
blk_mq_unfreeze_queue(lo->lo_queue);
|
||||
|
||||
/* flush dirty pages before starting to issue direct I/O */
|
||||
if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !dio_in_use)
|
||||
vfs_fsync(lo->lo_backing_file, 0);
|
||||
}
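From userspace, the effective flag can be toggled with the LOOP_SET_DIRECT_IO ioctl and read back via LOOP_GET_STATUS64; a minimal sketch with error handling trimmed (the device path and helper name are just examples):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/loop.h>

static int loop_enable_dio(const char *dev)
{
	struct loop_info64 info = { 0 };
	int fd = open(dev, O_RDWR);

	if (fd < 0)
		return -1;
	/* May fail with EINVAL if the offset or block size rules out dio. */
	if (ioctl(fd, LOOP_SET_DIRECT_IO, 1UL) < 0) {
		close(fd);
		return -1;
	}
	ioctl(fd, LOOP_GET_STATUS64, &info);
	close(fd);
	/* LO_FLAGS_DIRECT_IO reports the effective state, as noted above. */
	return !!(info.lo_flags & LO_FLAGS_DIRECT_IO);
}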
|
||||
|
||||
/**
|
||||
@ -311,6 +313,13 @@ static void loop_clear_limits(struct loop_device *lo, int mode)
|
||||
lim.discard_granularity = 0;
|
||||
}
|
||||
|
||||
/*
 * XXX: this updates the queue limits without freezing the queue, which
 * is against the locking protocol and dangerous. But we can't just
 * freeze the queue as we're inside the ->queue_rq method here. So this
 * should move out into a workqueue unless we get the file operations to
 * advertise if they support specific fallocate operations.
 */
|
||||
queue_limits_commit_update(lo->lo_queue, &lim);
|
||||
}
|
||||
|
||||
@ -520,12 +529,6 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
|
||||
}
|
||||
}
|
||||
|
||||
static inline void loop_update_dio(struct loop_device *lo)
|
||||
{
|
||||
__loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) |
|
||||
lo->use_dio);
|
||||
}
|
||||
|
||||
static void loop_reread_partitions(struct loop_device *lo)
|
||||
{
|
||||
int rc;
|
||||
@ -964,7 +967,6 @@ loop_set_status_from_info(struct loop_device *lo,
|
||||
|
||||
memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
|
||||
lo->lo_file_name[LO_NAME_SIZE-1] = 0;
|
||||
lo->lo_flags = info->lo_flags;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -977,12 +979,12 @@ static unsigned int loop_default_blocksize(struct loop_device *lo,
|
||||
return SECTOR_SIZE;
|
||||
}
|
||||
|
||||
static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize)
|
||||
static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
|
||||
unsigned int bsize)
|
||||
{
|
||||
struct file *file = lo->lo_backing_file;
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
struct block_device *backing_bdev = NULL;
|
||||
struct queue_limits lim;
|
||||
u32 granularity = 0, max_discard_sectors = 0;
|
||||
|
||||
if (S_ISBLK(inode->i_mode))
|
||||
@ -995,22 +997,20 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize)
|
||||
|
||||
loop_get_discard_config(lo, &granularity, &max_discard_sectors);
|
||||
|
||||
lim = queue_limits_start_update(lo->lo_queue);
|
||||
lim.logical_block_size = bsize;
|
||||
lim.physical_block_size = bsize;
|
||||
lim.io_min = bsize;
|
||||
lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
|
||||
lim->logical_block_size = bsize;
|
||||
lim->physical_block_size = bsize;
|
||||
lim->io_min = bsize;
|
||||
lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
|
||||
if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
|
||||
lim.features |= BLK_FEAT_WRITE_CACHE;
|
||||
lim->features |= BLK_FEAT_WRITE_CACHE;
|
||||
if (backing_bdev && !bdev_nonrot(backing_bdev))
|
||||
lim.features |= BLK_FEAT_ROTATIONAL;
|
||||
lim.max_hw_discard_sectors = max_discard_sectors;
|
||||
lim.max_write_zeroes_sectors = max_discard_sectors;
|
||||
lim->features |= BLK_FEAT_ROTATIONAL;
|
||||
lim->max_hw_discard_sectors = max_discard_sectors;
|
||||
lim->max_write_zeroes_sectors = max_discard_sectors;
|
||||
if (max_discard_sectors)
|
||||
lim.discard_granularity = granularity;
|
||||
lim->discard_granularity = granularity;
|
||||
else
|
||||
lim.discard_granularity = 0;
|
||||
return queue_limits_commit_update(lo->lo_queue, &lim);
|
||||
lim->discard_granularity = 0;
|
||||
}
|
||||
|
||||
static int loop_configure(struct loop_device *lo, blk_mode_t mode,
|
||||
@ -1019,6 +1019,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
|
||||
{
|
||||
struct file *file = fget(config->fd);
|
||||
struct address_space *mapping;
|
||||
struct queue_limits lim;
|
||||
int error;
|
||||
loff_t size;
|
||||
bool partscan;
|
||||
@ -1063,6 +1064,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
|
||||
error = loop_set_status_from_info(lo, &config->info);
|
||||
if (error)
|
||||
goto out_unlock;
|
||||
lo->lo_flags = config->info.lo_flags;
|
||||
|
||||
if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) ||
|
||||
!file->f_op->write_iter)
|
||||
@ -1084,13 +1086,15 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
|
||||
disk_force_media_change(lo->lo_disk);
|
||||
set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);
|
||||
|
||||
lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
|
||||
lo->lo_device = bdev;
|
||||
lo->lo_backing_file = file;
|
||||
lo->old_gfp_mask = mapping_gfp_mask(mapping);
|
||||
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
|
||||
|
||||
error = loop_reconfigure_limits(lo, config->block_size);
|
||||
lim = queue_limits_start_update(lo->lo_queue);
|
||||
loop_update_limits(lo, &lim, config->block_size);
|
||||
/* No need to freeze the queue as the device isn't bound yet. */
|
||||
error = queue_limits_commit_update(lo->lo_queue, &lim);
|
||||
if (error)
|
||||
goto out_unlock;
|
||||
|
||||
@ -1150,7 +1154,12 @@ static void __loop_clr_fd(struct loop_device *lo)
|
||||
lo->lo_sizelimit = 0;
|
||||
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
|
||||
|
||||
/* reset the block size to the default */
|
||||
/*
|
||||
* Reset the block size to the default.
|
||||
*
|
||||
* No queue freezing needed because this is called from the final
|
||||
* ->release call only, so there can't be any outstanding I/O.
|
||||
*/
|
||||
lim = queue_limits_start_update(lo->lo_queue);
|
||||
lim.logical_block_size = SECTOR_SIZE;
|
||||
lim.physical_block_size = SECTOR_SIZE;
|
||||
@ -1244,7 +1253,6 @@ static int
|
||||
loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
|
||||
{
|
||||
int err;
|
||||
int prev_lo_flags;
|
||||
bool partscan = false;
|
||||
bool size_changed = false;
|
||||
|
||||
@ -1263,21 +1271,19 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
|
||||
invalidate_bdev(lo->lo_device);
|
||||
}
|
||||
|
||||
/* I/O need to be drained during transfer transition */
|
||||
/* I/O needs to be drained before changing lo_offset or lo_sizelimit */
|
||||
blk_mq_freeze_queue(lo->lo_queue);
|
||||
|
||||
prev_lo_flags = lo->lo_flags;
|
||||
|
||||
err = loop_set_status_from_info(lo, info);
|
||||
if (err)
|
||||
goto out_unfreeze;
|
||||
|
||||
/* Mask out flags that can't be set using LOOP_SET_STATUS. */
|
||||
lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS;
|
||||
/* For those flags, use the previous values instead */
|
||||
lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS;
|
||||
/* For flags that can't be cleared, use previous values too */
|
||||
lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS;
|
||||
partscan = !(lo->lo_flags & LO_FLAGS_PARTSCAN) &&
|
||||
(info->lo_flags & LO_FLAGS_PARTSCAN);
|
||||
|
||||
lo->lo_flags &= ~(LOOP_SET_STATUS_SETTABLE_FLAGS |
|
||||
LOOP_SET_STATUS_CLEARABLE_FLAGS);
|
||||
lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS);
|
||||
|
||||
if (size_changed) {
|
||||
loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
|
||||
@ -1285,17 +1291,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
|
||||
loop_set_size(lo, new_size);
|
||||
}
|
||||
|
||||
/* update dio if lo_offset or transfer is changed */
|
||||
__loop_update_dio(lo, lo->use_dio);
|
||||
/* update the direct I/O flag if lo_offset changed */
|
||||
loop_update_dio(lo);
|
||||
|
||||
out_unfreeze:
|
||||
blk_mq_unfreeze_queue(lo->lo_queue);
|
||||
|
||||
if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) &&
|
||||
!(prev_lo_flags & LO_FLAGS_PARTSCAN)) {
|
||||
if (partscan)
|
||||
clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
|
||||
partscan = true;
|
||||
}
|
||||
out_unlock:
|
||||
mutex_unlock(&lo->lo_mutex);
|
||||
if (partscan)
|
||||
@ -1444,20 +1446,32 @@ static int loop_set_capacity(struct loop_device *lo)
|
||||
|
||||
static int loop_set_dio(struct loop_device *lo, unsigned long arg)
|
||||
{
|
||||
int error = -ENXIO;
|
||||
if (lo->lo_state != Lo_bound)
|
||||
goto out;
|
||||
bool use_dio = !!arg;
|
||||
|
||||
__loop_update_dio(lo, !!arg);
|
||||
if (lo->use_dio == !!arg)
|
||||
if (lo->lo_state != Lo_bound)
|
||||
return -ENXIO;
|
||||
if (use_dio == !!(lo->lo_flags & LO_FLAGS_DIRECT_IO))
|
||||
return 0;
|
||||
error = -EINVAL;
|
||||
out:
|
||||
return error;
|
||||
|
||||
if (use_dio) {
|
||||
if (!lo_can_use_dio(lo))
|
||||
return -EINVAL;
|
||||
/* flush dirty pages before starting to use direct I/O */
|
||||
vfs_fsync(lo->lo_backing_file, 0);
|
||||
}
|
||||
|
||||
blk_mq_freeze_queue(lo->lo_queue);
|
||||
if (use_dio)
|
||||
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
|
||||
else
|
||||
lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
|
||||
blk_mq_unfreeze_queue(lo->lo_queue);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
|
||||
{
|
||||
struct queue_limits lim;
|
||||
int err = 0;
|
||||
|
||||
if (lo->lo_state != Lo_bound)
|
||||
@ -1469,8 +1483,11 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
|
||||
sync_blockdev(lo->lo_device);
|
||||
invalidate_bdev(lo->lo_device);
|
||||
|
||||
lim = queue_limits_start_update(lo->lo_queue);
|
||||
loop_update_limits(lo, &lim, arg);
|
||||
|
||||
blk_mq_freeze_queue(lo->lo_queue);
|
||||
err = loop_reconfigure_limits(lo, arg);
|
||||
err = queue_limits_commit_update(lo->lo_queue, &lim);
|
||||
loop_update_dio(lo);
|
||||
blk_mq_unfreeze_queue(lo->lo_queue);
|
||||
|
||||
@ -1854,7 +1871,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
|
||||
cmd->use_aio = false;
|
||||
break;
|
||||
default:
|
||||
cmd->use_aio = lo->use_dio;
|
||||
cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -2023,8 +2040,7 @@ static int loop_add(int i)
|
||||
lo->tag_set.queue_depth = hw_queue_depth;
|
||||
lo->tag_set.numa_node = NUMA_NO_NODE;
|
||||
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
|
||||
lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING |
|
||||
BLK_MQ_F_NO_SCHED_BY_DEFAULT;
|
||||
lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT;
|
||||
lo->tag_set.driver_data = lo;
|
||||
|
||||
err = blk_mq_alloc_tag_set(&lo->tag_set);
|
||||
|
@ -3416,7 +3416,6 @@ static int mtip_block_initialize(struct driver_data *dd)
|
||||
dd->tags.reserved_tags = 1;
|
||||
dd->tags.cmd_size = sizeof(struct mtip_cmd);
|
||||
dd->tags.numa_node = dd->numa_node;
|
||||
dd->tags.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
dd->tags.driver_data = dd;
|
||||
dd->tags.timeout = MTIP_NCQ_CMD_TIMEOUT_MS;
|
||||
|
||||
|
@ -62,6 +62,7 @@ struct nbd_sock {
|
||||
bool dead;
|
||||
int fallback_index;
|
||||
int cookie;
|
||||
struct work_struct work;
|
||||
};
|
||||
|
||||
struct recv_thread_args {
|
||||
@ -141,6 +142,9 @@ struct nbd_device {
|
||||
*/
|
||||
#define NBD_CMD_INFLIGHT 2
|
||||
|
||||
/* Just part of request header or data payload is sent successfully */
|
||||
#define NBD_CMD_PARTIAL_SEND 3
|
||||
|
||||
struct nbd_cmd {
|
||||
struct nbd_device *nbd;
|
||||
struct mutex lock;
|
||||
@ -327,8 +331,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
|
||||
nsock->sent = 0;
|
||||
}
|
||||
|
||||
static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
|
||||
loff_t blksize)
|
||||
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize)
|
||||
{
|
||||
struct queue_limits lim;
|
||||
int error;
|
||||
@ -368,7 +371,7 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
|
||||
|
||||
lim.logical_block_size = blksize;
|
||||
lim.physical_block_size = blksize;
|
||||
error = queue_limits_commit_update(nbd->disk->queue, &lim);
|
||||
error = queue_limits_commit_update_frozen(nbd->disk->queue, &lim);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
@ -379,18 +382,6 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
|
||||
loff_t blksize)
|
||||
{
|
||||
int error;
|
||||
|
||||
blk_mq_freeze_queue(nbd->disk->queue);
|
||||
error = __nbd_set_size(nbd, bytesize, blksize);
|
||||
blk_mq_unfreeze_queue(nbd->disk->queue);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
static void nbd_complete_rq(struct request *req)
|
||||
{
|
||||
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
|
||||
@ -466,6 +457,12 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
|
||||
if (!mutex_trylock(&cmd->lock))
|
||||
return BLK_EH_RESET_TIMER;
|
||||
|
||||
/* partial send is handled in nbd_sock's work function */
|
||||
if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) {
|
||||
mutex_unlock(&cmd->lock);
|
||||
return BLK_EH_RESET_TIMER;
|
||||
}
|
||||
|
||||
if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
|
||||
mutex_unlock(&cmd->lock);
|
||||
return BLK_EH_DONE;
|
||||
@ -614,6 +611,30 @@ static inline int was_interrupted(int result)
|
||||
return result == -ERESTARTSYS || result == -EINTR;
|
||||
}
|
||||
|
||||
/*
* We've already sent header or part of data payload, have no choice but
* to set pending and schedule it in work.
*
* And we have to return BLK_STS_OK to block core, otherwise this same
* request may be re-dispatched with different tag, but our header has
* been sent out with old tag, and this way does confuse reply handling.
*/
static void nbd_sched_pending_work(struct nbd_device *nbd,
struct nbd_sock *nsock,
struct nbd_cmd *cmd, int sent)
{
struct request *req = blk_mq_rq_from_pdu(cmd);

/* pending work should be scheduled only once */
WARN_ON_ONCE(test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags));

nsock->pending = req;
nsock->sent = sent;
set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
refcount_inc(&nbd->config_refs);
schedule_work(&nsock->work);
}

/*
|
||||
* Returns BLK_STS_RESOURCE if the caller should retry after a delay.
|
||||
* Returns BLK_STS_IOERR if sending failed.
|
||||
@ -699,8 +720,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
|
||||
* completely done.
|
||||
*/
|
||||
if (sent) {
|
||||
nsock->pending = req;
|
||||
nsock->sent = sent;
|
||||
nbd_sched_pending_work(nbd, nsock, cmd, sent);
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
set_bit(NBD_CMD_REQUEUED, &cmd->flags);
|
||||
return BLK_STS_RESOURCE;
|
||||
@ -737,14 +758,8 @@ send_pages:
|
||||
result = sock_xmit(nbd, index, 1, &from, flags, &sent);
|
||||
if (result < 0) {
|
||||
if (was_interrupted(result)) {
|
||||
/* We've already sent the header, we
|
||||
* have no choice but to set pending and
|
||||
* return BUSY.
|
||||
*/
|
||||
nsock->pending = req;
|
||||
nsock->sent = sent;
|
||||
set_bit(NBD_CMD_REQUEUED, &cmd->flags);
|
||||
return BLK_STS_RESOURCE;
|
||||
nbd_sched_pending_work(nbd, nsock, cmd, sent);
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
dev_err(disk_to_dev(nbd->disk),
|
||||
"Send data failed (result %d)\n",
|
||||
@ -770,6 +785,14 @@ out:
|
||||
return BLK_STS_OK;
|
||||
|
||||
requeue:
|
||||
/*
|
||||
* Can't requeue in case we are dealing with partial send
|
||||
*
|
||||
* We must run from pending work function.
|
||||
* */
|
||||
if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))
|
||||
return BLK_STS_OK;
|
||||
|
||||
/* retry on a different socket */
|
||||
dev_err_ratelimited(disk_to_dev(nbd->disk),
|
||||
"Request send failed, requeueing\n");
|
||||
@ -778,6 +801,44 @@ requeue:
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
/* handle partial sending */
static void nbd_pending_cmd_work(struct work_struct *work)
{
struct nbd_sock *nsock = container_of(work, struct nbd_sock, work);
struct request *req = nsock->pending;
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
struct nbd_device *nbd = cmd->nbd;
unsigned long deadline = READ_ONCE(req->deadline);
unsigned int wait_ms = 2;

mutex_lock(&cmd->lock);

WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags));
if (WARN_ON_ONCE(!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)))
goto out;

mutex_lock(&nsock->tx_lock);
while (true) {
nbd_send_cmd(nbd, cmd, cmd->index);
if (!nsock->pending)
break;

/* don't bother timeout handler for partial sending */
if (READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms) >= deadline) {
cmd->status = BLK_STS_IOERR;
blk_mq_complete_request(req);
break;
}
msleep(wait_ms);
wait_ms *= 2;
}
mutex_unlock(&nsock->tx_lock);
clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
out:
mutex_unlock(&cmd->lock);
nbd_config_put(nbd);
}

static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock,
|
||||
struct nbd_reply *reply)
|
||||
{
|
||||
@ -1224,6 +1285,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
|
||||
nsock->pending = NULL;
|
||||
nsock->sent = 0;
|
||||
nsock->cookie = 0;
|
||||
INIT_WORK(&nsock->work, nbd_pending_cmd_work);
|
||||
socks[config->num_connections++] = nsock;
|
||||
atomic_inc(&config->live_connections);
|
||||
blk_mq_unfreeze_queue(nbd->disk->queue);
|
||||
@ -1841,8 +1903,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
|
||||
nbd->tag_set.queue_depth = 128;
|
||||
nbd->tag_set.numa_node = NUMA_NO_NODE;
|
||||
nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
|
||||
nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
|
||||
BLK_MQ_F_BLOCKING;
|
||||
nbd->tag_set.flags = BLK_MQ_F_BLOCKING;
|
||||
nbd->tag_set.driver_data = nbd;
|
||||
INIT_WORK(&nbd->remove_work, nbd_dev_remove_work);
|
||||
nbd->backend = NULL;
|
||||
@ -2180,6 +2241,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd)
|
||||
flush_workqueue(nbd->recv_workq);
|
||||
nbd_clear_que(nbd);
|
||||
nbd->task_setup = NULL;
|
||||
clear_bit(NBD_RT_BOUND, &nbd->config->runtime_flags);
|
||||
mutex_unlock(&nbd->config_lock);
|
||||
|
||||
if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
|
||||
|
@@ -266,6 +266,10 @@ static bool g_zone_full;
module_param_named(zone_full, g_zone_full, bool, S_IRUGO);
MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false");

static bool g_rotational;
module_param_named(rotational, g_rotational, bool, S_IRUGO);
MODULE_PARM_DESC(rotational, "Set the rotational feature for the device. Default: false");

static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
@ -468,6 +472,7 @@ NULLB_DEVICE_ATTR(no_sched, bool, NULL);
|
||||
NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
|
||||
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
|
||||
NULLB_DEVICE_ATTR(fua, bool, NULL);
|
||||
NULLB_DEVICE_ATTR(rotational, bool, NULL);
|
||||
|
||||
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
|
||||
{
|
||||
@ -621,6 +626,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
|
||||
&nullb_device_attr_shared_tags,
|
||||
&nullb_device_attr_shared_tag_bitmap,
|
||||
&nullb_device_attr_fua,
|
||||
&nullb_device_attr_rotational,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@ -706,7 +712,8 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page)
|
||||
"shared_tags,size,submit_queues,use_per_node_hctx,"
|
||||
"virt_boundary,zoned,zone_capacity,zone_max_active,"
|
||||
"zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
|
||||
"zone_size,zone_append_max_sectors,zone_full\n");
|
||||
"zone_size,zone_append_max_sectors,zone_full,"
|
||||
"rotational\n");
|
||||
}
|
||||
|
||||
CONFIGFS_ATTR_RO(memb_group_, features);
|
||||
@ -793,6 +800,7 @@ static struct nullb_device *null_alloc_dev(void)
|
||||
dev->shared_tags = g_shared_tags;
|
||||
dev->shared_tag_bitmap = g_shared_tag_bitmap;
|
||||
dev->fua = g_fua;
|
||||
dev->rotational = g_rotational;
|
||||
|
||||
return dev;
|
||||
}
|
||||
@ -899,7 +907,7 @@ static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
|
||||
if (radix_tree_insert(root, idx, t_page)) {
|
||||
null_free_page(t_page);
|
||||
t_page = radix_tree_lookup(root, idx);
|
||||
WARN_ON(!t_page || t_page->page->index != idx);
|
||||
WARN_ON(!t_page || t_page->page->private != idx);
|
||||
} else if (is_cache)
|
||||
nullb->dev->curr_cache += PAGE_SIZE;
|
||||
|
||||
@ -922,7 +930,7 @@ static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
|
||||
(void **)t_pages, pos, FREE_BATCH);
|
||||
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
pos = t_pages[i]->page->index;
|
||||
pos = t_pages[i]->page->private;
|
||||
ret = radix_tree_delete_item(root, pos, t_pages[i]);
|
||||
WARN_ON(ret != t_pages[i]);
|
||||
null_free_page(ret);
|
||||
@ -948,7 +956,7 @@ static struct nullb_page *__null_lookup_page(struct nullb *nullb,
|
||||
|
||||
root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
|
||||
t_page = radix_tree_lookup(root, idx);
|
||||
WARN_ON(t_page && t_page->page->index != idx);
|
||||
WARN_ON(t_page && t_page->page->private != idx);
|
||||
|
||||
if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
|
||||
return t_page;
|
||||
@ -991,7 +999,7 @@ static struct nullb_page *null_insert_page(struct nullb *nullb,
|
||||
|
||||
spin_lock_irq(&nullb->lock);
|
||||
idx = sector >> PAGE_SECTORS_SHIFT;
|
||||
t_page->page->index = idx;
|
||||
t_page->page->private = idx;
|
||||
t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
|
||||
radix_tree_preload_end();
|
||||
|
||||
@ -1011,7 +1019,7 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
|
||||
struct nullb_page *t_page, *ret;
|
||||
void *dst, *src;
|
||||
|
||||
idx = c_page->page->index;
|
||||
idx = c_page->page->private;
|
||||
|
||||
t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
|
||||
|
||||
@ -1070,7 +1078,7 @@ again:
|
||||
* avoid race, we don't allow page free
|
||||
*/
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
nullb->cache_flush_pos = c_pages[i]->page->index;
|
||||
nullb->cache_flush_pos = c_pages[i]->page->private;
|
||||
/*
|
||||
* We found the page which is being flushed to disk by other
|
||||
* threads
|
||||
@ -1783,9 +1791,8 @@ static int null_init_global_tag_set(void)
|
||||
tag_set.nr_hw_queues = g_submit_queues;
|
||||
tag_set.queue_depth = g_hw_queue_depth;
|
||||
tag_set.numa_node = g_home_node;
|
||||
tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
if (g_no_sched)
|
||||
tag_set.flags |= BLK_MQ_F_NO_SCHED;
|
||||
tag_set.flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT;
|
||||
if (g_shared_tag_bitmap)
|
||||
tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED;
|
||||
if (g_blocking)
|
||||
@ -1809,9 +1816,8 @@ static int null_setup_tagset(struct nullb *nullb)
|
||||
nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues;
|
||||
nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth;
|
||||
nullb->tag_set->numa_node = nullb->dev->home_node;
|
||||
nullb->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
if (nullb->dev->no_sched)
|
||||
nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED;
|
||||
nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT;
|
||||
if (nullb->dev->shared_tag_bitmap)
|
||||
nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
|
||||
if (nullb->dev->blocking)
|
||||
@ -1938,6 +1944,9 @@ static int null_add_dev(struct nullb_device *dev)
|
||||
lim.features |= BLK_FEAT_FUA;
|
||||
}
|
||||
|
||||
if (dev->rotational)
|
||||
lim.features |= BLK_FEAT_ROTATIONAL;
|
||||
|
||||
nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb);
|
||||
if (IS_ERR(nullb->disk)) {
|
||||
rv = PTR_ERR(nullb->disk);
|
||||
|
@ -107,6 +107,7 @@ struct nullb_device {
|
||||
bool shared_tags; /* share tag set between devices for blk-mq */
|
||||
bool shared_tag_bitmap; /* use hostwide shared tags */
|
||||
bool fua; /* Support FUA */
|
||||
bool rotational; /* Fake rotational device */
|
||||
};
|
||||
|
||||
struct nullb {
|
||||
|
@ -384,9 +384,9 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
|
||||
unsigned int devidx;
|
||||
struct queue_limits lim = {
|
||||
.logical_block_size = dev->blk_size,
|
||||
.max_hw_sectors = dev->bounce_size >> 9,
|
||||
.max_hw_sectors = BOUNCE_SIZE >> 9,
|
||||
.max_segments = -1,
|
||||
.max_segment_size = dev->bounce_size,
|
||||
.max_segment_size = BOUNCE_SIZE,
|
||||
.dma_alignment = dev->blk_size - 1,
|
||||
.features = BLK_FEAT_WRITE_CACHE |
|
||||
BLK_FEAT_ROTATIONAL,
|
||||
@ -434,8 +434,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
|
||||
|
||||
ps3disk_identify(dev);
|
||||
|
||||
error = blk_mq_alloc_sq_tag_set(&priv->tag_set, &ps3disk_mq_ops, 1,
|
||||
BLK_MQ_F_SHOULD_MERGE);
|
||||
error = blk_mq_alloc_sq_tag_set(&priv->tag_set, &ps3disk_mq_ops, 1, 0);
|
||||
if (error)
|
||||
goto fail_teardown;
|
||||
|
||||
|
@ -4964,7 +4964,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
|
||||
rbd_dev->tag_set.ops = &rbd_mq_ops;
|
||||
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
|
||||
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
|
||||
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
|
||||
rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
|
||||
|
||||
|
@ -1209,8 +1209,7 @@ static int setup_mq_tags(struct rnbd_clt_session *sess)
|
||||
tag_set->ops = &rnbd_mq_ops;
|
||||
tag_set->queue_depth = sess->queue_depth;
|
||||
tag_set->numa_node = NUMA_NO_NODE;
|
||||
tag_set->flags = BLK_MQ_F_SHOULD_MERGE |
|
||||
BLK_MQ_F_TAG_QUEUE_SHARED;
|
||||
tag_set->flags = BLK_MQ_F_TAG_QUEUE_SHARED;
|
||||
tag_set->cmd_size = sizeof(struct rnbd_iu) + RNBD_RDMA_SGL_SIZE;
|
||||
|
||||
/* for HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL */
|
||||
|
@ -167,7 +167,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
|
||||
bio->bi_iter.bi_sector = le64_to_cpu(msg->sector);
|
||||
prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR ||
|
||||
usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio);
|
||||
bio_set_prio(bio, prio);
|
||||
bio->bi_ioprio = prio;
|
||||
|
||||
submit_bio(bio);
|
||||
|
||||
|
@@ -32,25 +32,31 @@ module! {
license: "GPL v2",
}

#[pin_data]
struct NullBlkModule {
_disk: Pin<KBox<Mutex<GenDisk<NullBlkDevice>>>>,
#[pin]
_disk: Mutex<GenDisk<NullBlkDevice>>,
}

impl kernel::Module for NullBlkModule {
fn init(_module: &'static ThisModule) -> Result<Self> {
impl kernel::InPlaceModule for NullBlkModule {
fn init(_module: &'static ThisModule) -> impl PinInit<Self, Error> {
pr_info!("Rust null_blk loaded\n");
let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;

let disk = gen_disk::GenDiskBuilder::new()
.capacity_sectors(4096 << 11)
.logical_block_size(4096)?
.physical_block_size(4096)?
.rotational(false)
.build(format_args!("rnullb{}", 0), tagset)?;
// Use a immediately-called closure as a stable `try` block
let disk = /* try */ (|| {
let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;

let disk = KBox::pin_init(new_mutex!(disk, "nullb:disk"), flags::GFP_KERNEL)?;
gen_disk::GenDiskBuilder::new()
.capacity_sectors(4096 << 11)
.logical_block_size(4096)?
.physical_block_size(4096)?
.rotational(false)
.build(format_args!("rnullb{}", 0), tagset)
})();

Ok(Self { _disk: disk })
try_pin_init!(Self {
_disk <- new_mutex!(disk?, "nullb:disk"),
})
}
}

@ -829,7 +829,7 @@ static int probe_disk(struct vdc_port *port)
|
||||
}
|
||||
|
||||
err = blk_mq_alloc_sq_tag_set(&port->tag_set, &vdc_mq_ops,
|
||||
VDC_TX_RING_SIZE, BLK_MQ_F_SHOULD_MERGE);
|
||||
VDC_TX_RING_SIZE, 0);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
|
@ -818,7 +818,7 @@ static int swim_floppy_init(struct swim_priv *swd)
|
||||
|
||||
for (drive = 0; drive < swd->floppy_count; drive++) {
|
||||
err = blk_mq_alloc_sq_tag_set(&swd->unit[drive].tag_set,
|
||||
&swim_mq_ops, 2, BLK_MQ_F_SHOULD_MERGE);
|
||||
&swim_mq_ops, 2, 0);
|
||||
if (err)
|
||||
goto exit_put_disks;
|
||||
|
||||
|
@ -1208,8 +1208,7 @@ static int swim3_attach(struct macio_dev *mdev,
|
||||
fs = &floppy_states[floppy_count];
|
||||
memset(fs, 0, sizeof(*fs));
|
||||
|
||||
rc = blk_mq_alloc_sq_tag_set(&fs->tag_set, &swim3_mq_ops, 2,
|
||||
BLK_MQ_F_SHOULD_MERGE);
|
||||
rc = blk_mq_alloc_sq_tag_set(&fs->tag_set, &swim3_mq_ops, 2, 0);
|
||||
if (rc)
|
||||
goto out_unregister;
|
||||
|
||||
|
@ -2213,7 +2213,6 @@ static int ublk_add_tag_set(struct ublk_device *ub)
|
||||
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
|
||||
ub->tag_set.numa_node = NUMA_NO_NODE;
|
||||
ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
|
||||
ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
ub->tag_set.driver_data = ub;
|
||||
return blk_mq_alloc_tag_set(&ub->tag_set);
|
||||
}
|
||||
|
@ -13,7 +13,6 @@
|
||||
#include <linux/string_helpers.h>
|
||||
#include <linux/idr.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/blk-mq-virtio.h>
|
||||
#include <linux/numa.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <uapi/linux/virtio_ring.h>
|
||||
@ -1106,9 +1105,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr,
|
||||
lim.features |= BLK_FEAT_WRITE_CACHE;
|
||||
else
|
||||
lim.features &= ~BLK_FEAT_WRITE_CACHE;
|
||||
blk_mq_freeze_queue(disk->queue);
|
||||
i = queue_limits_commit_update(disk->queue, &lim);
|
||||
blk_mq_unfreeze_queue(disk->queue);
|
||||
i = queue_limits_commit_update_frozen(disk->queue, &lim);
|
||||
if (i)
|
||||
return i;
|
||||
return count;
|
||||
@ -1181,7 +1178,8 @@ static void virtblk_map_queues(struct blk_mq_tag_set *set)
|
||||
if (i == HCTX_TYPE_POLL)
|
||||
blk_mq_map_queues(&set->map[i]);
|
||||
else
|
||||
blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
|
||||
blk_mq_map_hw_queues(&set->map[i],
|
||||
&vblk->vdev->dev, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1481,7 +1479,6 @@ static int virtblk_probe(struct virtio_device *vdev)
|
||||
vblk->tag_set.ops = &virtio_mq_ops;
|
||||
vblk->tag_set.queue_depth = queue_depth;
|
||||
vblk->tag_set.numa_node = NUMA_NO_NODE;
|
||||
vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
vblk->tag_set.cmd_size =
|
||||
sizeof(struct virtblk_req) +
|
||||
sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
|
||||
|
@ -1131,7 +1131,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
|
||||
} else
|
||||
info->tag_set.queue_depth = BLK_RING_SIZE(info);
|
||||
info->tag_set.numa_node = NUMA_NO_NODE;
|
||||
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
info->tag_set.cmd_size = sizeof(struct blkif_req);
|
||||
info->tag_set.driver_data = info;
|
||||
|
||||
|
@ -354,7 +354,6 @@ static int __init z2_init(void)
|
||||
tag_set.nr_maps = 1;
|
||||
tag_set.queue_depth = 16;
|
||||
tag_set.numa_node = NUMA_NO_NODE;
|
||||
tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
ret = blk_mq_alloc_tag_set(&tag_set);
|
||||
if (ret)
|
||||
goto out_unregister_blkdev;
|
||||
|
@ -777,7 +777,7 @@ static int probe_gdrom(struct platform_device *devptr)
|
||||
probe_gdrom_setupcd();
|
||||
|
||||
err = blk_mq_alloc_sq_tag_set(&gd.tag_set, &gdrom_mq_ops, 1,
|
||||
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);
|
||||
BLK_MQ_F_BLOCKING);
|
||||
if (err)
|
||||
goto probe_fail_free_cd_info;
|
||||
|
||||
|
@ -61,6 +61,19 @@ config MD_BITMAP_FILE
|
||||
various kernel APIs and can only work with files on a file system not
|
||||
actually sitting on the MD device.
|
||||
|
||||
config MD_LINEAR
|
||||
tristate "Linear (append) mode"
|
||||
depends on BLK_DEV_MD
|
||||
help
|
||||
If you say Y here, then your multiple devices driver will be able to
|
||||
use the so-called linear mode, i.e. it will combine the hard disk
|
||||
partitions by simply appending one to the other.
|
||||
|
||||
To compile this as a module, choose M here: the module
|
||||
will be called linear.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MD_RAID0
|
||||
tristate "RAID-0 (striping) mode"
|
||||
depends on BLK_DEV_MD
|
||||
|
@ -29,12 +29,14 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
|
||||
|
||||
md-mod-y += md.o md-bitmap.o
|
||||
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
|
||||
linear-y += md-linear.o
|
||||
|
||||
# Note: link order is important. All raid personalities
|
||||
# and must come before md.o, as they each initialise
|
||||
# themselves, and md.o may use the personalities when it
|
||||
# auto-initialised.
|
||||
|
||||
obj-$(CONFIG_MD_LINEAR) += linear.o
|
||||
obj-$(CONFIG_MD_RAID0) += raid0.o
|
||||
obj-$(CONFIG_MD_RAID1) += raid1.o
|
||||
obj-$(CONFIG_MD_RAID10) += raid10.o
|
||||
|
@ -82,7 +82,7 @@ static void moving_init(struct moving_io *io)
|
||||
bio_init(bio, NULL, bio->bi_inline_vecs,
|
||||
DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0);
|
||||
bio_get(bio);
|
||||
bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
|
||||
|
||||
bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9;
|
||||
bio->bi_private = &io->cl;
|
||||
|
@ -334,7 +334,7 @@ static void dirty_init(struct keybuf_key *w)
|
||||
bio_init(bio, NULL, bio->bi_inline_vecs,
|
||||
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
|
||||
if (!io->dc->writeback_percent)
|
||||
bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
|
||||
|
||||
bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
|
||||
bio->bi_private = w;
|
||||
|
@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
|
||||
md->tag_set->ops = &dm_mq_ops;
|
||||
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
|
||||
md->tag_set->numa_node = md->numa_node_id;
|
||||
md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
|
||||
md->tag_set->flags = BLK_MQ_F_STACKING;
|
||||
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
|
||||
md->tag_set->driver_data = md;
|
||||
|
||||
|
@ -122,7 +122,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
|
||||
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
|
||||
|
||||
par = fec_read_parity(v, rsb, block_offset, &offset,
|
||||
par_buf_offset, &buf, bio_prio(bio));
|
||||
par_buf_offset, &buf, bio->bi_ioprio);
|
||||
if (IS_ERR(par))
|
||||
return PTR_ERR(par);
|
||||
|
||||
@ -164,7 +164,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
|
||||
dm_bufio_release(buf);
|
||||
|
||||
par = fec_read_parity(v, rsb, block_offset, &offset,
|
||||
par_buf_offset, &buf, bio_prio(bio));
|
||||
par_buf_offset, &buf, bio->bi_ioprio);
|
||||
if (IS_ERR(par))
|
||||
return PTR_ERR(par);
|
||||
}
|
||||
@ -254,7 +254,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
|
||||
bufio = v->bufio;
|
||||
}
|
||||
|
||||
bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio_prio(bio));
|
||||
bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio);
|
||||
if (IS_ERR(bbuf)) {
|
||||
DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld",
|
||||
v->data_dev->name,
|
||||
|
@ -321,7 +321,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
|
||||
}
|
||||
} else {
|
||||
data = dm_bufio_read_with_ioprio(v->bufio, hash_block,
|
||||
&buf, bio_prio(bio));
|
||||
&buf, bio->bi_ioprio);
|
||||
}
|
||||
|
||||
if (IS_ERR(data))
|
||||
@ -789,7 +789,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
|
||||
|
||||
verity_fec_init_io(io);
|
||||
|
||||
verity_submit_prefetch(v, io, bio_prio(bio));
|
||||
verity_submit_prefetch(v, io, bio->bi_ioprio);
|
||||
|
||||
submit_bio_noacct(bio);
|
||||
|
||||
|
@ -49,6 +49,7 @@ static int md_setup_ents __initdata;
|
||||
* instead of just one. -- KTK
|
||||
* 18May2000: Added support for persistent-superblock arrays:
|
||||
* md=n,0,factor,fault,device-list uses RAID0 for device n
|
||||
* md=n,-1,factor,fault,device-list uses LINEAR for device n
|
||||
* md=n,device-list reads a RAID superblock from the devices
|
||||
* elements in device-list are read by name_to_kdev_t so can be
|
||||
* a hex number or something like /dev/hda1 /dev/sdb
|
||||
@ -87,7 +88,7 @@ static int __init md_setup(char *str)
|
||||
md_setup_ents++;
|
||||
switch (get_option(&str, &level)) { /* RAID level */
|
||||
case 2: /* could be 0 or -1.. */
|
||||
if (level == 0) {
|
||||
if (level == 0 || level == LEVEL_LINEAR) {
|
||||
if (get_option(&str, &factor) != 2 || /* Chunk Size */
|
||||
get_option(&str, &fault) != 2) {
|
||||
printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
|
||||
@ -95,7 +96,10 @@ static int __init md_setup(char *str)
|
||||
}
|
||||
md_setup_args[ent].level = level;
|
||||
md_setup_args[ent].chunk = 1 << (factor+12);
|
||||
pername = "raid0";
|
||||
if (level == LEVEL_LINEAR)
|
||||
pername = "linear";
|
||||
else
|
||||
pername = "raid0";
|
||||
break;
|
||||
}
|
||||
fallthrough;
|
||||
|
@ -682,7 +682,7 @@ static void bitmap_update_sb(void *data)
|
||||
return;
|
||||
if (!bitmap->storage.sb_page) /* no superblock */
|
||||
return;
|
||||
sb = kmap_atomic(bitmap->storage.sb_page);
|
||||
sb = kmap_local_page(bitmap->storage.sb_page);
|
||||
sb->events = cpu_to_le64(bitmap->mddev->events);
|
||||
if (bitmap->mddev->events < bitmap->events_cleared)
|
||||
/* rocking back to read-only */
|
||||
@ -702,7 +702,7 @@ static void bitmap_update_sb(void *data)
|
||||
sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
|
||||
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
|
||||
bitmap_info.space);
|
||||
kunmap_atomic(sb);
|
||||
kunmap_local(sb);
|
||||
|
||||
if (bitmap->storage.file)
|
||||
write_file_page(bitmap, bitmap->storage.sb_page, 1);
|
||||
@ -717,7 +717,7 @@ static void bitmap_print_sb(struct bitmap *bitmap)
|
||||
|
||||
if (!bitmap || !bitmap->storage.sb_page)
|
||||
return;
|
||||
sb = kmap_atomic(bitmap->storage.sb_page);
|
||||
sb = kmap_local_page(bitmap->storage.sb_page);
|
||||
pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
|
||||
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
|
||||
pr_debug(" version: %u\n", le32_to_cpu(sb->version));
|
||||
@ -736,7 +736,7 @@ static void bitmap_print_sb(struct bitmap *bitmap)
|
||||
pr_debug(" sync size: %llu KB\n",
|
||||
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
|
||||
pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
|
||||
kunmap_atomic(sb);
|
||||
kunmap_local(sb);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -760,7 +760,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
|
||||
return -ENOMEM;
|
||||
bitmap->storage.sb_index = 0;
|
||||
|
||||
sb = kmap_atomic(bitmap->storage.sb_page);
|
||||
sb = kmap_local_page(bitmap->storage.sb_page);
|
||||
|
||||
sb->magic = cpu_to_le32(BITMAP_MAGIC);
|
||||
sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
|
||||
@ -768,7 +768,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
|
||||
chunksize = bitmap->mddev->bitmap_info.chunksize;
|
||||
BUG_ON(!chunksize);
|
||||
if (!is_power_of_2(chunksize)) {
|
||||
kunmap_atomic(sb);
|
||||
kunmap_local(sb);
|
||||
pr_warn("bitmap chunksize not a power of 2\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -803,7 +803,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
|
||||
sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
|
||||
bitmap->mddev->bitmap_info.nodes = 0;
|
||||
|
||||
kunmap_atomic(sb);
|
||||
kunmap_local(sb);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -865,7 +865,7 @@ re_read:
|
||||
return err;
|
||||
|
||||
err = -EINVAL;
|
||||
sb = kmap_atomic(sb_page);
|
||||
sb = kmap_local_page(sb_page);
|
||||
|
||||
chunksize = le32_to_cpu(sb->chunksize);
|
||||
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
|
||||
@ -932,7 +932,7 @@ re_read:
|
||||
err = 0;
|
||||
|
||||
out:
|
||||
kunmap_atomic(sb);
|
||||
kunmap_local(sb);
|
||||
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
|
||||
/* Assigning chunksize is required for "re_read" */
|
||||
bitmap->mddev->bitmap_info.chunksize = chunksize;
|
||||
@ -1161,12 +1161,12 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
|
||||
bit = file_page_offset(&bitmap->storage, chunk);
|
||||
|
||||
/* set the bit */
|
||||
kaddr = kmap_atomic(page);
|
||||
kaddr = kmap_local_page(page);
|
||||
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
|
||||
set_bit(bit, kaddr);
|
||||
else
|
||||
set_bit_le(bit, kaddr);
|
||||
kunmap_atomic(kaddr);
|
||||
kunmap_local(kaddr);
|
||||
pr_debug("set file bit %lu page %lu\n", bit, index);
|
||||
/* record page number so it gets flushed to disk when unplug occurs */
|
||||
set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
|
||||
@ -1190,12 +1190,12 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
|
||||
if (!page)
|
||||
return;
|
||||
bit = file_page_offset(&bitmap->storage, chunk);
|
||||
paddr = kmap_atomic(page);
|
||||
paddr = kmap_local_page(page);
|
||||
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
|
||||
clear_bit(bit, paddr);
|
||||
else
|
||||
clear_bit_le(bit, paddr);
|
||||
kunmap_atomic(paddr);
|
||||
kunmap_local(paddr);
|
||||
if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
|
||||
set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
|
||||
bitmap->allclean = 0;
|
||||
@ -1214,12 +1214,12 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
|
||||
if (!page)
|
||||
return -EINVAL;
|
||||
bit = file_page_offset(&bitmap->storage, chunk);
|
||||
paddr = kmap_atomic(page);
|
||||
paddr = kmap_local_page(page);
|
||||
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
|
||||
set = test_bit(bit, paddr);
|
||||
else
|
||||
set = test_bit_le(bit, paddr);
|
||||
kunmap_atomic(paddr);
|
||||
kunmap_local(paddr);
|
||||
return set;
|
||||
}
|
||||
|
||||
@ -1388,9 +1388,9 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
|
||||
* If the bitmap is out of date, dirty the whole page
|
||||
* and write it out
|
||||
*/
|
||||
paddr = kmap_atomic(page);
|
||||
paddr = kmap_local_page(page);
|
||||
memset(paddr + offset, 0xff, PAGE_SIZE - offset);
|
||||
kunmap_atomic(paddr);
|
||||
kunmap_local(paddr);
|
||||
|
||||
filemap_write_page(bitmap, i, true);
|
||||
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
|
||||
@ -1406,12 +1406,12 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
|
||||
void *paddr;
|
||||
bool was_set;
|
||||
|
||||
paddr = kmap_atomic(page);
|
||||
paddr = kmap_local_page(page);
|
||||
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
|
||||
was_set = test_bit(bit, paddr);
|
||||
else
|
||||
was_set = test_bit_le(bit, paddr);
|
||||
kunmap_atomic(paddr);
|
||||
kunmap_local(paddr);
|
||||
|
||||
if (was_set) {
|
||||
/* if the disk bit is set, set the memory bit */
|
||||
@ -1546,10 +1546,10 @@ static void bitmap_daemon_work(struct mddev *mddev)
|
||||
bitmap_super_t *sb;
|
||||
bitmap->need_sync = 0;
|
||||
if (bitmap->storage.filemap) {
|
||||
sb = kmap_atomic(bitmap->storage.sb_page);
|
||||
sb = kmap_local_page(bitmap->storage.sb_page);
|
||||
sb->events_cleared =
|
||||
cpu_to_le64(bitmap->events_cleared);
|
||||
kunmap_atomic(sb);
|
||||
kunmap_local(sb);
|
||||
set_page_attr(bitmap, 0,
|
||||
BITMAP_PAGE_NEEDWRITE);
|
||||
}
|
||||
@ -1671,24 +1671,13 @@ __acquires(bitmap->lock)
|
||||
}
|
||||
|
||||
static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
|
||||
unsigned long sectors, bool behind)
|
||||
unsigned long sectors)
|
||||
{
|
||||
struct bitmap *bitmap = mddev->bitmap;
|
||||
|
||||
if (!bitmap)
|
||||
return 0;
|
||||
|
||||
if (behind) {
|
||||
int bw;
|
||||
atomic_inc(&bitmap->behind_writes);
|
||||
bw = atomic_read(&bitmap->behind_writes);
|
||||
if (bw > bitmap->behind_writes_used)
|
||||
bitmap->behind_writes_used = bw;
|
||||
|
||||
pr_debug("inc write-behind count %d/%lu\n",
|
||||
bw, bitmap->mddev->bitmap_info.max_write_behind);
|
||||
}
|
||||
|
||||
while (sectors) {
|
||||
sector_t blocks;
|
||||
bitmap_counter_t *bmc;
|
||||
@ -1737,21 +1726,13 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
|
||||
}
|
||||
|
||||
static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
|
||||
unsigned long sectors, bool success, bool behind)
|
||||
unsigned long sectors)
|
||||
{
|
||||
struct bitmap *bitmap = mddev->bitmap;
|
||||
|
||||
if (!bitmap)
|
||||
return;
|
||||
|
||||
if (behind) {
|
||||
if (atomic_dec_and_test(&bitmap->behind_writes))
|
||||
wake_up(&bitmap->behind_wait);
|
||||
pr_debug("dec write-behind count %d/%lu\n",
|
||||
atomic_read(&bitmap->behind_writes),
|
||||
bitmap->mddev->bitmap_info.max_write_behind);
|
||||
}
|
||||
|
||||
while (sectors) {
|
||||
sector_t blocks;
|
||||
unsigned long flags;
|
||||
@ -1764,15 +1745,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
|
||||
return;
|
||||
}
|
||||
|
||||
if (success && !bitmap->mddev->degraded &&
|
||||
bitmap->events_cleared < bitmap->mddev->events) {
|
||||
bitmap->events_cleared = bitmap->mddev->events;
|
||||
bitmap->need_sync = 1;
|
||||
sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
|
||||
}
|
||||
|
||||
if (!success && !NEEDED(*bmc))
|
||||
if (!bitmap->mddev->degraded) {
|
||||
if (bitmap->events_cleared < bitmap->mddev->events) {
|
||||
bitmap->events_cleared = bitmap->mddev->events;
|
||||
bitmap->need_sync = 1;
|
||||
sysfs_notify_dirent_safe(
|
||||
bitmap->sysfs_can_clear);
|
||||
}
|
||||
} else if (!NEEDED(*bmc)) {
|
||||
*bmc |= NEEDED_MASK;
|
||||
}
|
||||
|
||||
if (COUNTER(*bmc) == COUNTER_MAX)
|
||||
wake_up(&bitmap->overflow_wait);
|
||||
@ -2062,6 +2044,37 @@ static void md_bitmap_free(void *data)
|
||||
kfree(bitmap);
|
||||
}
|
||||
|
||||
static void bitmap_start_behind_write(struct mddev *mddev)
|
||||
{
|
||||
struct bitmap *bitmap = mddev->bitmap;
|
||||
int bw;
|
||||
|
||||
if (!bitmap)
|
||||
return;
|
||||
|
||||
atomic_inc(&bitmap->behind_writes);
|
||||
bw = atomic_read(&bitmap->behind_writes);
|
||||
if (bw > bitmap->behind_writes_used)
|
||||
bitmap->behind_writes_used = bw;
|
||||
|
||||
pr_debug("inc write-behind count %d/%lu\n",
|
||||
bw, bitmap->mddev->bitmap_info.max_write_behind);
|
||||
}
|
||||
|
||||
static void bitmap_end_behind_write(struct mddev *mddev)
|
||||
{
|
||||
struct bitmap *bitmap = mddev->bitmap;
|
||||
|
||||
if (!bitmap)
|
||||
return;
|
||||
|
||||
if (atomic_dec_and_test(&bitmap->behind_writes))
|
||||
wake_up(&bitmap->behind_wait);
|
||||
pr_debug("dec write-behind count %d/%lu\n",
|
||||
atomic_read(&bitmap->behind_writes),
|
||||
bitmap->mddev->bitmap_info.max_write_behind);
|
||||
}
|
||||
|
||||
static void bitmap_wait_behind_writes(struct mddev *mddev)
|
||||
{
|
||||
struct bitmap *bitmap = mddev->bitmap;
|
||||
@ -2981,6 +2994,9 @@ static struct bitmap_operations bitmap_ops = {
|
||||
.dirty_bits = bitmap_dirty_bits,
|
||||
.unplug = bitmap_unplug,
|
||||
.daemon_work = bitmap_daemon_work,
|
||||
|
||||
.start_behind_write = bitmap_start_behind_write,
|
||||
.end_behind_write = bitmap_end_behind_write,
|
||||
.wait_behind_writes = bitmap_wait_behind_writes,
|
||||
|
||||
.startwrite = bitmap_startwrite,
|
||||
|
@@ -84,12 +84,15 @@ struct bitmap_operations {
unsigned long e);
void (*unplug)(struct mddev *mddev, bool sync);
void (*daemon_work)(struct mddev *mddev);

void (*start_behind_write)(struct mddev *mddev);
void (*end_behind_write)(struct mddev *mddev);
void (*wait_behind_writes)(struct mddev *mddev);

int (*startwrite)(struct mddev *mddev, sector_t offset,
unsigned long sectors, bool behind);
unsigned long sectors);
void (*endwrite)(struct mddev *mddev, sector_t offset,
unsigned long sectors, bool success, bool behind);
unsigned long sectors);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);

354
drivers/md/md-linear.c
Normal file
354
drivers/md/md-linear.c
Normal file
@@ -0,0 +1,354 @@
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/*
|
||||
* linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc
|
||||
* ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr>
|
||||
*/
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/raid/md_u.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <trace/events/block.h>
|
||||
#include "md.h"
|
||||
|
||||
struct dev_info {
|
||||
struct md_rdev *rdev;
|
||||
sector_t end_sector;
|
||||
};
|
||||
|
||||
struct linear_conf {
|
||||
struct rcu_head rcu;
|
||||
sector_t array_sectors;
|
||||
/* a copy of mddev->raid_disks */
|
||||
int raid_disks;
|
||||
struct dev_info disks[] __counted_by(raid_disks);
|
||||
};
|
||||
|
||||
/*
|
||||
* find which device holds a particular offset
|
||||
*/
|
||||
static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
|
||||
{
|
||||
int lo, mid, hi;
|
||||
struct linear_conf *conf;
|
||||
|
||||
lo = 0;
|
||||
hi = mddev->raid_disks - 1;
|
||||
conf = mddev->private;
|
||||
|
||||
/*
|
||||
* Binary Search
|
||||
*/
|
||||
|
||||
while (hi > lo) {
|
||||
|
||||
mid = (hi + lo) / 2;
|
||||
if (sector < conf->disks[mid].end_sector)
|
||||
hi = mid;
|
||||
else
|
||||
lo = mid + 1;
|
||||
}
|
||||
|
||||
return conf->disks + lo;
|
||||
}
|
||||
|
||||
static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
|
||||
{
|
||||
struct linear_conf *conf;
|
||||
sector_t array_sectors;
|
||||
|
||||
conf = mddev->private;
|
||||
WARN_ONCE(sectors || raid_disks,
|
||||
"%s does not support generic reshape\n", __func__);
|
||||
array_sectors = conf->array_sectors;
|
||||
|
||||
return array_sectors;
|
||||
}
|
||||
|
||||
static int linear_set_limits(struct mddev *mddev)
|
||||
{
|
||||
struct queue_limits lim;
|
||||
int err;
|
||||
|
||||
md_init_stacking_limits(&lim);
|
||||
lim.max_hw_sectors = mddev->chunk_sectors;
|
||||
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
|
||||
lim.io_min = mddev->chunk_sectors << 9;
|
||||
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
|
||||
if (err) {
|
||||
queue_limits_cancel_update(mddev->gendisk->queue);
|
||||
return err;
|
||||
}
|
||||
|
||||
return queue_limits_set(mddev->gendisk->queue, &lim);
|
||||
}
|
||||
|
||||
static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
|
||||
{
|
||||
struct linear_conf *conf;
|
||||
struct md_rdev *rdev;
|
||||
int ret = -EINVAL;
|
||||
int cnt;
|
||||
int i;
|
||||
|
||||
conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
|
||||
if (!conf)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
/*
|
||||
* conf->raid_disks is copy of mddev->raid_disks. The reason to
|
||||
* keep a copy of mddev->raid_disks in struct linear_conf is,
|
||||
* mddev->raid_disks may not be consistent with pointers number of
|
||||
* conf->disks[] when it is updated in linear_add() and used to
|
||||
* iterate old conf->disks[] earray in linear_congested().
|
||||
* Here conf->raid_disks is always consitent with number of
|
||||
* pointers in conf->disks[] array, and mddev->private is updated
|
||||
* with rcu_assign_pointer() in linear_addr(), such race can be
|
||||
* avoided.
|
||||
*/
|
||||
conf->raid_disks = raid_disks;
|
||||
|
||||
cnt = 0;
|
||||
conf->array_sectors = 0;
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
int j = rdev->raid_disk;
|
||||
struct dev_info *disk = conf->disks + j;
|
||||
sector_t sectors;
|
||||
|
||||
if (j < 0 || j >= raid_disks || disk->rdev) {
|
||||
pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
|
||||
mdname(mddev));
|
||||
goto out;
|
||||
}
|
||||
|
||||
disk->rdev = rdev;
|
||||
if (mddev->chunk_sectors) {
|
||||
sectors = rdev->sectors;
|
||||
sector_div(sectors, mddev->chunk_sectors);
|
||||
rdev->sectors = sectors * mddev->chunk_sectors;
|
||||
}
|
||||
|
||||
conf->array_sectors += rdev->sectors;
|
||||
cnt++;
|
||||
}
|
||||
if (cnt != raid_disks) {
|
||||
pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
|
||||
mdname(mddev));
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Here we calculate the device offsets.
|
||||
*/
|
||||
conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
|
||||
|
||||
for (i = 1; i < raid_disks; i++)
|
||||
conf->disks[i].end_sector =
|
||||
conf->disks[i-1].end_sector +
|
||||
conf->disks[i].rdev->sectors;
|
||||
|
||||
if (!mddev_is_dm(mddev)) {
|
||||
ret = linear_set_limits(mddev);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
return conf;
|
||||
|
||||
out:
|
||||
kfree(conf);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static int linear_run(struct mddev *mddev)
|
||||
{
|
||||
struct linear_conf *conf;
|
||||
int ret;
|
||||
|
||||
if (md_check_no_bitmap(mddev))
|
||||
return -EINVAL;
|
||||
|
||||
conf = linear_conf(mddev, mddev->raid_disks);
|
||||
if (IS_ERR(conf))
|
||||
return PTR_ERR(conf);
|
||||
|
||||
mddev->private = conf;
|
||||
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
|
||||
|
||||
ret = md_integrity_register(mddev);
|
||||
if (ret) {
|
||||
kfree(conf);
|
||||
mddev->private = NULL;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
/* Adding a drive to a linear array allows the array to grow.
|
||||
* It is permitted if the new drive has a matching superblock
|
||||
* already on it, with raid_disk equal to raid_disks.
|
||||
* It is achieved by creating a new linear_private_data structure
|
||||
* and swapping it in in-place of the current one.
|
||||
* The current one is never freed until the array is stopped.
|
||||
* This avoids races.
|
||||
*/
|
||||
struct linear_conf *newconf, *oldconf;
|
||||
|
||||
if (rdev->saved_raid_disk != mddev->raid_disks)
|
||||
return -EINVAL;
|
||||
|
||||
rdev->raid_disk = rdev->saved_raid_disk;
|
||||
rdev->saved_raid_disk = -1;
|
||||
|
||||
newconf = linear_conf(mddev, mddev->raid_disks + 1);
|
||||
if (!newconf)
|
||||
return -ENOMEM;
|
||||
|
||||
/* newconf->raid_disks already keeps a copy of * the increased
|
||||
* value of mddev->raid_disks, WARN_ONCE() is just used to make
|
||||
* sure of this. It is possible that oldconf is still referenced
|
||||
* in linear_congested(), therefore kfree_rcu() is used to free
|
||||
* oldconf until no one uses it anymore.
|
||||
*/
|
||||
oldconf = rcu_dereference_protected(mddev->private,
|
||||
lockdep_is_held(&mddev->reconfig_mutex));
|
||||
mddev->raid_disks++;
|
||||
WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
|
||||
"copied raid_disks doesn't match mddev->raid_disks");
|
||||
rcu_assign_pointer(mddev->private, newconf);
|
||||
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
|
||||
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
|
||||
kfree_rcu(oldconf, rcu);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void linear_free(struct mddev *mddev, void *priv)
|
||||
{
|
||||
struct linear_conf *conf = priv;
|
||||
|
||||
kfree(conf);
|
||||
}
|
||||
|
||||
static bool linear_make_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
struct dev_info *tmp_dev;
|
||||
sector_t start_sector, end_sector, data_offset;
|
||||
sector_t bio_sector = bio->bi_iter.bi_sector;
|
||||
|
||||
if (unlikely(bio->bi_opf & REQ_PREFLUSH)
|
||||
&& md_flush_request(mddev, bio))
|
||||
return true;
|
||||
|
||||
tmp_dev = which_dev(mddev, bio_sector);
|
||||
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
|
||||
end_sector = tmp_dev->end_sector;
|
||||
data_offset = tmp_dev->rdev->data_offset;
|
||||
|
||||
if (unlikely(bio_sector >= end_sector ||
|
||||
bio_sector < start_sector))
|
||||
goto out_of_bounds;
|
||||
|
||||
if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
|
||||
md_error(mddev, tmp_dev->rdev);
|
||||
bio_io_error(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (unlikely(bio_end_sector(bio) > end_sector)) {
|
||||
/* This bio crosses a device boundary, so we have to split it */
|
||||
struct bio *split = bio_split(bio, end_sector - bio_sector,
|
||||
GFP_NOIO, &mddev->bio_set);
|
||||
|
||||
if (IS_ERR(split)) {
|
||||
bio->bi_status = errno_to_blk_status(PTR_ERR(split));
|
||||
bio_endio(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
bio_chain(split, bio);
|
||||
submit_bio_noacct(bio);
|
||||
bio = split;
|
||||
}
|
||||
|
||||
md_account_bio(mddev, &bio);
|
||||
bio_set_dev(bio, tmp_dev->rdev->bdev);
|
||||
bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
|
||||
start_sector + data_offset;
|
||||
|
||||
if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
|
||||
!bdev_max_discard_sectors(bio->bi_bdev))) {
|
||||
/* Just ignore it */
|
||||
bio_endio(bio);
|
||||
} else {
|
||||
if (mddev->gendisk)
|
||||
trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
|
||||
bio_sector);
|
||||
mddev_check_write_zeroes(mddev, bio);
|
||||
submit_bio_noacct(bio);
|
||||
}
|
||||
return true;
|
||||
|
||||
out_of_bounds:
|
||||
pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n",
|
||||
mdname(mddev),
|
||||
(unsigned long long)bio->bi_iter.bi_sector,
|
||||
tmp_dev->rdev->bdev,
|
||||
(unsigned long long)tmp_dev->rdev->sectors,
|
||||
(unsigned long long)start_sector);
|
||||
bio_io_error(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void linear_status(struct seq_file *seq, struct mddev *mddev)
|
||||
{
|
||||
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
|
||||
}
|
||||
|
||||
static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
|
||||
char *md_name = mdname(mddev);
|
||||
|
||||
pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
|
||||
md_name, rdev->bdev);
|
||||
}
|
||||
}
|
||||
|
||||
static void linear_quiesce(struct mddev *mddev, int state)
|
||||
{
|
||||
}
|
||||
|
||||
static struct md_personality linear_personality = {
|
||||
.name = "linear",
|
||||
.level = LEVEL_LINEAR,
|
||||
.owner = THIS_MODULE,
|
||||
.make_request = linear_make_request,
|
||||
.run = linear_run,
|
||||
.free = linear_free,
|
||||
.status = linear_status,
|
||||
.hot_add_disk = linear_add,
|
||||
.size = linear_size,
|
||||
.quiesce = linear_quiesce,
|
||||
.error_handler = linear_error,
|
||||
};
|
||||
|
||||
static int __init linear_init(void)
|
||||
{
|
||||
return register_md_personality(&linear_personality);
|
||||
}
|
||||
|
||||
static void linear_exit(void)
|
||||
{
|
||||
unregister_md_personality(&linear_personality);
|
||||
}
|
||||
|
||||
module_init(linear_init);
|
||||
module_exit(linear_exit);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
|
||||
MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
|
||||
MODULE_ALIAS("md-linear");
|
||||
MODULE_ALIAS("md-level--1");
|
@ -8124,7 +8124,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
|
||||
return;
|
||||
mddev->pers->error_handler(mddev, rdev);
|
||||
|
||||
if (mddev->pers->level == 0)
|
||||
if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
|
||||
return;
|
||||
|
||||
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
|
||||
@ -8745,12 +8745,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
|
||||
|
||||
static void md_bitmap_start(struct mddev *mddev,
struct md_io_clone *md_io_clone)
{
if (mddev->pers->bitmap_sector)
mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
&md_io_clone->sectors);

mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
md_io_clone->sectors);
}

static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
{
mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
md_io_clone->sectors);
}

static void md_end_clone_io(struct bio *bio)
|
||||
{
|
||||
struct md_io_clone *md_io_clone = bio->bi_private;
|
||||
struct bio *orig_bio = md_io_clone->orig_bio;
|
||||
struct mddev *mddev = md_io_clone->mddev;
|
||||
|
||||
if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
|
||||
md_bitmap_end(mddev, md_io_clone);
|
||||
|
||||
if (bio->bi_status && !orig_bio->bi_status)
|
||||
orig_bio->bi_status = bio->bi_status;
|
||||
|
||||
@ -8775,6 +8795,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
|
||||
if (blk_queue_io_stat(bdev->bd_disk->queue))
|
||||
md_io_clone->start_time = bio_start_io_acct(*bio);
|
||||
|
||||
if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
|
||||
md_io_clone->offset = (*bio)->bi_iter.bi_sector;
|
||||
md_io_clone->sectors = bio_sectors(*bio);
|
||||
md_bitmap_start(mddev, md_io_clone);
|
||||
}
|
||||
|
||||
clone->bi_end_io = md_end_clone_io;
|
||||
clone->bi_private = md_io_clone;
|
||||
*bio = clone;
|
||||
@ -8793,6 +8819,9 @@ void md_free_cloned_bio(struct bio *bio)
|
||||
struct bio *orig_bio = md_io_clone->orig_bio;
|
||||
struct mddev *mddev = md_io_clone->mddev;
|
||||
|
||||
if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
|
||||
md_bitmap_end(mddev, md_io_clone);
|
||||
|
||||
if (bio->bi_status && !orig_bio->bi_status)
|
||||
orig_bio->bi_status = bio->bi_status;
|
||||
|
||||
|
@ -746,6 +746,9 @@ struct md_personality
|
||||
void *(*takeover) (struct mddev *mddev);
|
||||
/* Changes the consistency policy of an active array. */
|
||||
int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
|
||||
/* convert io ranges from array to bitmap */
|
||||
void (*bitmap_sector)(struct mddev *mddev, sector_t *offset,
|
||||
unsigned long *sectors);
|
||||
};
|
||||
|
||||
struct md_sysfs_entry {
|
||||
@ -828,6 +831,8 @@ struct md_io_clone {
|
||||
struct mddev *mddev;
|
||||
struct bio *orig_bio;
|
||||
unsigned long start_time;
|
||||
sector_t offset;
|
||||
unsigned long sectors;
|
||||
struct bio bio_clone;
|
||||
};
|
||||
|
||||
|
@ -420,10 +420,8 @@ static void close_write(struct r1bio *r1_bio)
|
||||
r1_bio->behind_master_bio = NULL;
|
||||
}
|
||||
|
||||
/* clear the bitmap if all writes complete successfully */
|
||||
mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
|
||||
!test_bit(R1BIO_Degraded, &r1_bio->state),
|
||||
test_bit(R1BIO_BehindIO, &r1_bio->state));
|
||||
if (test_bit(R1BIO_BehindIO, &r1_bio->state))
|
||||
mddev->bitmap_ops->end_behind_write(mddev);
|
||||
md_write_end(mddev);
|
||||
}
|
||||
|
||||
@ -480,8 +478,6 @@ static void raid1_end_write_request(struct bio *bio)
|
||||
if (!test_bit(Faulty, &rdev->flags))
|
||||
set_bit(R1BIO_WriteError, &r1_bio->state);
|
||||
else {
|
||||
/* Fail the request */
|
||||
set_bit(R1BIO_Degraded, &r1_bio->state);
|
||||
/* Finished with this branch */
|
||||
r1_bio->bios[mirror] = NULL;
|
||||
to_put = bio;
|
||||
@ -1535,11 +1531,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
|
||||
write_behind = true;
|
||||
|
||||
r1_bio->bios[i] = NULL;
|
||||
if (!rdev || test_bit(Faulty, &rdev->flags)) {
|
||||
if (i < conf->raid_disks)
|
||||
set_bit(R1BIO_Degraded, &r1_bio->state);
|
||||
if (!rdev || test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
}
|
||||
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
if (test_bit(WriteErrorSeen, &rdev->flags)) {
|
||||
@ -1558,16 +1551,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
|
||||
*/
|
||||
max_sectors = bad_sectors;
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
/* We don't set R1BIO_Degraded as that
|
||||
* only applies if the disk is
|
||||
* missing, so it might be re-added,
|
||||
* and we want to know to recover this
|
||||
* chunk.
|
||||
* In this case the device is here,
|
||||
* and the fact that this chunk is not
|
||||
* in-sync is recorded in the bad
|
||||
* block log
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
if (is_bad) {
|
||||
@ -1645,9 +1628,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
|
||||
stats.behind_writes < max_write_behind)
|
||||
alloc_behind_master_bio(r1_bio, bio);
|
||||
|
||||
mddev->bitmap_ops->startwrite(
|
||||
mddev, r1_bio->sector, r1_bio->sectors,
|
||||
test_bit(R1BIO_BehindIO, &r1_bio->state));
|
||||
if (test_bit(R1BIO_BehindIO, &r1_bio->state))
|
||||
mddev->bitmap_ops->start_behind_write(mddev);
|
||||
first_clone = 0;
|
||||
}
|
||||
|
||||
@ -2614,12 +2596,10 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
|
||||
* errors.
|
||||
*/
|
||||
fail = true;
|
||||
if (!narrow_write_error(r1_bio, m)) {
|
||||
if (!narrow_write_error(r1_bio, m))
|
||||
md_error(conf->mddev,
|
||||
conf->mirrors[m].rdev);
|
||||
/* an I/O failed, we can't clear the bitmap */
|
||||
set_bit(R1BIO_Degraded, &r1_bio->state);
|
||||
}
|
||||
rdev_dec_pending(conf->mirrors[m].rdev,
|
||||
conf->mddev);
|
||||
}
|
||||
@ -2710,8 +2690,6 @@ static void raid1d(struct md_thread *thread)
|
||||
list_del(&r1_bio->retry_list);
|
||||
idx = sector_to_idx(r1_bio->sector);
|
||||
atomic_dec(&conf->nr_queued[idx]);
|
||||
if (mddev->degraded)
|
||||
set_bit(R1BIO_Degraded, &r1_bio->state);
|
||||
if (test_bit(R1BIO_WriteError, &r1_bio->state))
|
||||
close_write(r1_bio);
|
||||
raid_end_bio_io(r1_bio);
|
||||
|
@ -188,7 +188,6 @@ struct r1bio {
|
||||
enum r1bio_state {
|
||||
R1BIO_Uptodate,
|
||||
R1BIO_IsSync,
|
||||
R1BIO_Degraded,
|
||||
R1BIO_BehindIO,
|
||||
/* Set ReadError on bios that experience a readerror so that
|
||||
* raid1d knows what to do with them.
|
||||
|
@ -428,10 +428,6 @@ static void close_write(struct r10bio *r10_bio)
|
||||
{
|
||||
struct mddev *mddev = r10_bio->mddev;
|
||||
|
||||
/* clear the bitmap if all writes complete successfully */
|
||||
mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
|
||||
!test_bit(R10BIO_Degraded, &r10_bio->state),
|
||||
false);
|
||||
md_write_end(mddev);
|
||||
}
|
||||
|
||||
@ -501,7 +497,6 @@ static void raid10_end_write_request(struct bio *bio)
|
||||
set_bit(R10BIO_WriteError, &r10_bio->state);
|
||||
else {
|
||||
/* Fail the request */
|
||||
set_bit(R10BIO_Degraded, &r10_bio->state);
|
||||
r10_bio->devs[slot].bio = NULL;
|
||||
to_put = bio;
|
||||
dec_rdev = 1;
|
||||
@ -1438,10 +1433,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
||||
r10_bio->devs[i].bio = NULL;
|
||||
r10_bio->devs[i].repl_bio = NULL;
|
||||
|
||||
if (!rdev && !rrdev) {
|
||||
set_bit(R10BIO_Degraded, &r10_bio->state);
|
||||
if (!rdev && !rrdev)
|
||||
continue;
|
||||
}
|
||||
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
|
||||
sector_t first_bad;
|
||||
sector_t dev_sector = r10_bio->devs[i].addr;
|
||||
@ -1458,14 +1451,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
||||
* to other devices yet
|
||||
*/
|
||||
max_sectors = bad_sectors;
|
||||
/* We don't set R10BIO_Degraded as that
|
||||
* only applies if the disk is missing,
|
||||
* so it might be re-added, and we want to
|
||||
* know to recover this chunk.
|
||||
* In this case the device is here, and the
|
||||
* fact that this chunk is not in-sync is
|
||||
* recorded in the bad block log.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
if (is_bad) {
|
||||
@ -1519,8 +1504,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
||||
md_account_bio(mddev, &bio);
|
||||
r10_bio->master_bio = bio;
|
||||
atomic_set(&r10_bio->remaining, 1);
|
||||
mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors,
|
||||
false);
|
||||
|
||||
for (i = 0; i < conf->copies; i++) {
|
||||
if (r10_bio->devs[i].bio)
|
||||
@ -2966,11 +2949,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
} else if (bio != NULL && bio->bi_status) {
|
||||
fail = true;
|
||||
if (!narrow_write_error(r10_bio, m)) {
|
||||
if (!narrow_write_error(r10_bio, m))
|
||||
md_error(conf->mddev, rdev);
|
||||
set_bit(R10BIO_Degraded,
|
||||
&r10_bio->state);
|
||||
}
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
bio = r10_bio->devs[m].repl_bio;
|
||||
@ -3029,8 +3009,6 @@ static void raid10d(struct md_thread *thread)
|
||||
r10_bio = list_first_entry(&tmp, struct r10bio,
|
||||
retry_list);
|
||||
list_del(&r10_bio->retry_list);
|
||||
if (mddev->degraded)
|
||||
set_bit(R10BIO_Degraded, &r10_bio->state);
|
||||
|
||||
if (test_bit(R10BIO_WriteError,
|
||||
&r10_bio->state))
|
||||
|
@ -161,7 +161,6 @@ enum r10bio_state {
|
||||
R10BIO_IsSync,
|
||||
R10BIO_IsRecover,
|
||||
R10BIO_IsReshape,
|
||||
R10BIO_Degraded,
|
||||
/* Set ReadError on bios that experience a read error
|
||||
* so that raid10d knows what to do with them.
|
||||
*/
|
||||
|
@ -313,10 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
|
||||
if (sh->dev[i].written) {
|
||||
set_bit(R5_UPTODATE, &sh->dev[i].flags);
|
||||
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
|
||||
conf->mddev->bitmap_ops->endwrite(conf->mddev,
|
||||
sh->sector, RAID5_STRIPE_SECTORS(conf),
|
||||
!test_bit(STRIPE_DEGRADED, &sh->state),
|
||||
false);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1023,10 +1019,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
/* checksum is already calculated in last run */
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
continue;
addr = kmap_atomic(sh->dev[i].page);
addr = kmap_local_page(sh->dev[i].page);
sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
addr, PAGE_SIZE);
kunmap_atomic(addr);
kunmap_local(addr);
}
parity_pages = 1 + !!(sh->qd_idx >= 0);
data_pages = write_disks - parity_pages;
@@ -1979,9 +1975,9 @@ r5l_recovery_verify_data_checksum(struct r5l_log *log,
u32 checksum;

r5l_recovery_read_page(log, ctx, page, log_offset);
addr = kmap_atomic(page);
addr = kmap_local_page(page);
checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
kunmap_atomic(addr);
kunmap_local(addr);
return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
}

@@ -2381,11 +2377,11 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
payload->size = cpu_to_le32(BLOCK_SECTORS);
payload->location = cpu_to_le64(
raid5_compute_blocknr(sh, i, 0));
addr = kmap_atomic(dev->page);
addr = kmap_local_page(dev->page);
payload->checksum[0] = cpu_to_le32(
crc32c_le(log->uuid_checksum, addr,
PAGE_SIZE));
kunmap_atomic(addr);
kunmap_local(addr);
sync_page_io(log->rdev, write_pos, PAGE_SIZE,
dev->page, REQ_OP_WRITE, false);
write_pos = r5l_ring_add(log, write_pos,
@@ -2888,10 +2884,10 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)

if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
continue;
addr = kmap_atomic(sh->dev[i].page);
addr = kmap_local_page(sh->dev[i].page);
sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
addr, PAGE_SIZE);
kunmap_atomic(addr);
kunmap_local(addr);
pages++;
}
WARN_ON(pages == 0);

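These raid5-cache hunks swap the kmap_atomic()/kunmap_atomic() pairs for kmap_local_page()/kunmap_local(). A minimal, hedged sketch of the resulting pattern is shown below as a code fragment, not a full function; "page" and "seed" are placeholder names rather than identifiers from this patch. Unlike kmap_atomic(), the local variant does not disable preemption or page faults, but mappings must still be released in reverse (stack) order.

	/* Hedged sketch of the kmap_local_page() checksum pattern used above;
	 * 'page' and 'seed' are illustrative, not names from this patch. */
	void *addr;
	u32 csum;

	addr = kmap_local_page(page);
	csum = crc32c_le(seed, addr, PAGE_SIZE);
	kunmap_local(addr);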
@ -906,8 +906,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
|
||||
if (raid5_has_log(conf) || raid5_has_ppl(conf))
|
||||
return false;
|
||||
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
|
||||
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
|
||||
is_full_stripe_write(sh);
|
||||
is_full_stripe_write(sh);
|
||||
}
|
||||
|
||||
/* we only do back search */
|
||||
@ -1345,8 +1344,6 @@ again:
|
||||
submit_bio_noacct(rbi);
|
||||
}
|
||||
if (!rdev && !rrdev) {
|
||||
if (op_is_write(op))
|
||||
set_bit(STRIPE_DEGRADED, &sh->state);
|
||||
pr_debug("skip op %d on disc %d for sector %llu\n",
|
||||
bi->bi_opf, i, (unsigned long long)sh->sector);
|
||||
clear_bit(R5_LOCKED, &sh->dev[i].flags);
|
||||
@ -2884,7 +2881,6 @@ static void raid5_end_write_request(struct bio *bi)
|
||||
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
|
||||
} else {
|
||||
if (bi->bi_status) {
|
||||
set_bit(STRIPE_DEGRADED, &sh->state);
|
||||
set_bit(WriteErrorSeen, &rdev->flags);
|
||||
set_bit(R5_WriteError, &sh->dev[i].flags);
|
||||
if (!test_and_set_bit(WantReplacement, &rdev->flags))
|
||||
@ -3548,29 +3544,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
|
||||
(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
|
||||
sh->dev[dd_idx].sector);
|
||||
|
||||
if (conf->mddev->bitmap && firstwrite) {
|
||||
/* Cannot hold spinlock over bitmap_startwrite,
|
||||
* but must ensure this isn't added to a batch until
|
||||
* we have added to the bitmap and set bm_seq.
|
||||
* So set STRIPE_BITMAP_PENDING to prevent
|
||||
* batching.
|
||||
* If multiple __add_stripe_bio() calls race here they
|
||||
* much all set STRIPE_BITMAP_PENDING. So only the first one
|
||||
* to complete "bitmap_startwrite" gets to set
|
||||
* STRIPE_BIT_DELAY. This is important as once a stripe
|
||||
* is added to a batch, STRIPE_BIT_DELAY cannot be changed
|
||||
* any more.
|
||||
*/
|
||||
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
|
||||
spin_unlock_irq(&sh->stripe_lock);
|
||||
conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), false);
|
||||
spin_lock_irq(&sh->stripe_lock);
|
||||
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
|
||||
if (!sh->batch_head) {
|
||||
sh->bm_seq = conf->seq_flush+1;
|
||||
set_bit(STRIPE_BIT_DELAY, &sh->state);
|
||||
}
|
||||
if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
|
||||
sh->bm_seq = conf->seq_flush+1;
|
||||
set_bit(STRIPE_BIT_DELAY, &sh->state);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3621,7 +3597,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
BUG_ON(sh->batch_head);
|
||||
for (i = disks; i--; ) {
|
||||
struct bio *bi;
|
||||
int bitmap_end = 0;
|
||||
|
||||
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
|
||||
struct md_rdev *rdev = conf->disks[i].rdev;
|
||||
@ -3646,8 +3621,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
sh->dev[i].towrite = NULL;
|
||||
sh->overwrite_disks = 0;
|
||||
spin_unlock_irq(&sh->stripe_lock);
|
||||
if (bi)
|
||||
bitmap_end = 1;
|
||||
|
||||
log_stripe_write_finished(sh);
|
||||
|
||||
@ -3662,11 +3635,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
bio_io_error(bi);
|
||||
bi = nextbi;
|
||||
}
|
||||
if (bitmap_end)
|
||||
conf->mddev->bitmap_ops->endwrite(conf->mddev,
|
||||
sh->sector, RAID5_STRIPE_SECTORS(conf),
|
||||
false, false);
|
||||
bitmap_end = 0;
|
||||
/* and fail all 'written' */
|
||||
bi = sh->dev[i].written;
|
||||
sh->dev[i].written = NULL;
|
||||
@ -3675,7 +3643,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
sh->dev[i].page = sh->dev[i].orig_page;
|
||||
}
|
||||
|
||||
if (bi) bitmap_end = 1;
|
||||
while (bi && bi->bi_iter.bi_sector <
|
||||
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
|
||||
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
|
||||
@ -3709,10 +3676,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
bi = nextbi;
|
||||
}
|
||||
}
|
||||
if (bitmap_end)
|
||||
conf->mddev->bitmap_ops->endwrite(conf->mddev,
|
||||
sh->sector, RAID5_STRIPE_SECTORS(conf),
|
||||
false, false);
|
||||
/* If we were in the middle of a write the parity block might
|
||||
* still be locked - so just clear all R5_LOCKED flags
|
||||
*/
|
||||
@ -4061,10 +4024,7 @@ returnbi:
|
||||
bio_endio(wbi);
|
||||
wbi = wbi2;
|
||||
}
|
||||
conf->mddev->bitmap_ops->endwrite(conf->mddev,
|
||||
sh->sector, RAID5_STRIPE_SECTORS(conf),
|
||||
!test_bit(STRIPE_DEGRADED, &sh->state),
|
||||
false);
|
||||
|
||||
if (head_sh->batch_head) {
|
||||
sh = list_first_entry(&sh->batch_list,
|
||||
struct stripe_head,
|
||||
@ -4341,7 +4301,6 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
|
||||
s->locked++;
|
||||
set_bit(R5_Wantwrite, &dev->flags);
|
||||
|
||||
clear_bit(STRIPE_DEGRADED, &sh->state);
|
||||
set_bit(STRIPE_INSYNC, &sh->state);
|
||||
break;
|
||||
case check_state_run:
|
||||
@ -4498,7 +4457,6 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
|
||||
clear_bit(R5_Wantwrite, &dev->flags);
|
||||
s->locked--;
|
||||
}
|
||||
clear_bit(STRIPE_DEGRADED, &sh->state);
|
||||
|
||||
set_bit(STRIPE_INSYNC, &sh->state);
|
||||
break;
|
||||
@ -4891,8 +4849,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
|
||||
(1 << STRIPE_COMPUTE_RUN) |
|
||||
(1 << STRIPE_DISCARD) |
|
||||
(1 << STRIPE_BATCH_READY) |
|
||||
(1 << STRIPE_BATCH_ERR) |
|
||||
(1 << STRIPE_BITMAP_PENDING)),
|
||||
(1 << STRIPE_BATCH_ERR)),
|
||||
"stripe state: %lx\n", sh->state);
|
||||
WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
|
||||
(1 << STRIPE_REPLACED)),
|
||||
@ -4900,7 +4857,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
|
||||
|
||||
set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
|
||||
(1 << STRIPE_PREREAD_ACTIVE) |
|
||||
(1 << STRIPE_DEGRADED) |
|
||||
(1 << STRIPE_ON_UNPLUG_LIST)),
|
||||
head_sh->state & (1 << STRIPE_INSYNC));
|
||||
|
||||
@ -5784,10 +5740,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
||||
}
|
||||
spin_unlock_irq(&sh->stripe_lock);
|
||||
if (conf->mddev->bitmap) {
|
||||
for (d = 0; d < conf->raid_disks - conf->max_degraded;
|
||||
d++)
|
||||
mddev->bitmap_ops->startwrite(mddev, sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), false);
|
||||
sh->bm_seq = conf->seq_flush + 1;
|
||||
set_bit(STRIPE_BIT_DELAY, &sh->state);
|
||||
}
|
||||
@@ -5928,6 +5880,54 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev,
return LOC_BEHIND_RESHAPE;
}

static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset,
unsigned long *sectors)
{
struct r5conf *conf = mddev->private;
sector_t start = *offset;
sector_t end = start + *sectors;
sector_t prev_start = start;
sector_t prev_end = end;
int sectors_per_chunk;
enum reshape_loc loc;
int dd_idx;

sectors_per_chunk = conf->chunk_sectors *
(conf->raid_disks - conf->max_degraded);
start = round_down(start, sectors_per_chunk);
end = round_up(end, sectors_per_chunk);

start = raid5_compute_sector(conf, start, 0, &dd_idx, NULL);
end = raid5_compute_sector(conf, end, 0, &dd_idx, NULL);

/*
* For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make
* progress, hence it's the same as LOC_BEHIND_RESHAPE.
*/
loc = get_reshape_loc(mddev, conf, prev_start);
if (likely(loc != LOC_AHEAD_OF_RESHAPE)) {
*offset = start;
*sectors = end - start;
return;
}

sectors_per_chunk = conf->prev_chunk_sectors *
(conf->previous_raid_disks - conf->max_degraded);
prev_start = round_down(prev_start, sectors_per_chunk);
prev_end = round_down(prev_end, sectors_per_chunk);

prev_start = raid5_compute_sector(conf, prev_start, 1, &dd_idx, NULL);
prev_end = raid5_compute_sector(conf, prev_end, 1, &dd_idx, NULL);

/*
* for LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO
* is handled in make_stripe_request(), we can't know this here hence
* we set bits for both.
*/
*offset = min(start, prev_start);
*sectors = max(end, prev_end) - *offset;
}

static enum stripe_result make_stripe_request(struct mddev *mddev,
struct r5conf *conf, struct stripe_request_ctx *ctx,
sector_t logical_sector, struct bio *bi)
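The new raid5_bitmap_sector() callback widens a bitmap range so it covers whole data stripes, in both the current geometry and, while ahead of a reshape, the previous one. The chunk-alignment arithmetic alone can be illustrated with this standalone, hedged userspace sketch; all values and helper names here are made up for the example and are not part of the patch.

	/* Hedged, simplified illustration of the chunk alignment done by
	 * raid5_bitmap_sector(); numbers are arbitrary examples. */
	#include <stdio.h>
	#include <stdint.h>

	static uint64_t round_down_u64(uint64_t v, uint64_t a) { return v - (v % a); }
	static uint64_t round_up_u64(uint64_t v, uint64_t a) { return ((v + a - 1) / a) * a; }

	int main(void)
	{
		uint64_t chunk_sectors = 1024;             /* 512 KiB chunks */
		uint64_t data_disks = 3;                   /* raid_disks - max_degraded */
		uint64_t spc = chunk_sectors * data_disks; /* sectors per full data stripe */
		uint64_t start = 5000, sectors = 100;
		uint64_t end = start + sectors;

		start = round_down_u64(start, spc);
		end = round_up_u64(end, spc);

		printf("aligned range: [%llu, %llu), %llu sectors\n",
		       (unsigned long long)start, (unsigned long long)end,
		       (unsigned long long)(end - start));
		return 0;
	}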
@ -8976,6 +8976,7 @@ static struct md_personality raid6_personality =
|
||||
.takeover = raid6_takeover,
|
||||
.change_consistency_policy = raid5_change_consistency_policy,
|
||||
.prepare_suspend = raid5_prepare_suspend,
|
||||
.bitmap_sector = raid5_bitmap_sector,
|
||||
};
|
||||
static struct md_personality raid5_personality =
|
||||
{
|
||||
@ -9001,6 +9002,7 @@ static struct md_personality raid5_personality =
|
||||
.takeover = raid5_takeover,
|
||||
.change_consistency_policy = raid5_change_consistency_policy,
|
||||
.prepare_suspend = raid5_prepare_suspend,
|
||||
.bitmap_sector = raid5_bitmap_sector,
|
||||
};
|
||||
|
||||
static struct md_personality raid4_personality =
|
||||
@ -9027,6 +9029,7 @@ static struct md_personality raid4_personality =
|
||||
.takeover = raid4_takeover,
|
||||
.change_consistency_policy = raid5_change_consistency_policy,
|
||||
.prepare_suspend = raid5_prepare_suspend,
|
||||
.bitmap_sector = raid5_bitmap_sector,
|
||||
};
|
||||
|
||||
static int __init raid5_init(void)
|
||||
|
@ -358,7 +358,6 @@ enum {
|
||||
STRIPE_REPLACED,
|
||||
STRIPE_PREREAD_ACTIVE,
|
||||
STRIPE_DELAYED,
|
||||
STRIPE_DEGRADED,
|
||||
STRIPE_BIT_DELAY,
|
||||
STRIPE_EXPANDING,
|
||||
STRIPE_EXPAND_SOURCE,
|
||||
@ -372,9 +371,6 @@ enum {
|
||||
STRIPE_ON_RELEASE_LIST,
|
||||
STRIPE_BATCH_READY,
|
||||
STRIPE_BATCH_ERR,
|
||||
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
|
||||
* to batch yet.
|
||||
*/
|
||||
STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
|
||||
* this bit is used in two scenarios:
|
||||
*
|
||||
|
@ -2094,8 +2094,7 @@ static int msb_init_disk(struct memstick_dev *card)
|
||||
if (msb->disk_id < 0)
|
||||
return msb->disk_id;
|
||||
|
||||
rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &msb_mq_ops, 2,
|
||||
BLK_MQ_F_SHOULD_MERGE);
|
||||
rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &msb_mq_ops, 2, 0);
|
||||
if (rc)
|
||||
goto out_release_id;
|
||||
|
||||
|
@ -1139,8 +1139,7 @@ static int mspro_block_init_disk(struct memstick_dev *card)
|
||||
if (disk_id < 0)
|
||||
return disk_id;
|
||||
|
||||
rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &mspro_mq_ops, 2,
|
||||
BLK_MQ_F_SHOULD_MERGE);
|
||||
rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &mspro_mq_ops, 2, 0);
|
||||
if (rc)
|
||||
goto out_release_id;
|
||||
|
||||
|
@ -441,7 +441,7 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
|
||||
else
|
||||
mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
|
||||
mq->tag_set.numa_node = NUMA_NO_NODE;
|
||||
mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
|
||||
mq->tag_set.flags = BLK_MQ_F_BLOCKING;
|
||||
mq->tag_set.nr_hw_queues = 1;
|
||||
mq->tag_set.cmd_size = sizeof(struct mmc_queue_req);
|
||||
mq->tag_set.driver_data = mq;
|
||||
|
@ -329,7 +329,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
|
||||
goto out_list_del;
|
||||
|
||||
ret = blk_mq_alloc_sq_tag_set(new->tag_set, &mtd_mq_ops, 2,
|
||||
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);
|
||||
BLK_MQ_F_BLOCKING);
|
||||
if (ret)
|
||||
goto out_kfree_tag_set;
|
||||
|
||||
|
@ -383,7 +383,7 @@ int ubiblock_create(struct ubi_volume_info *vi)
|
||||
dev->tag_set.ops = &ubiblock_mq_ops;
|
||||
dev->tag_set.queue_depth = 64;
|
||||
dev->tag_set.numa_node = NUMA_NO_NODE;
|
||||
dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
|
||||
dev->tag_set.flags = BLK_MQ_F_BLOCKING;
|
||||
dev->tag_set.cmd_size = sizeof(struct ubiblock_pdu);
|
||||
dev->tag_set.driver_data = dev;
|
||||
dev->tag_set.nr_hw_queues = 1;
|
||||
|
@ -1251,7 +1251,6 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
|
||||
anv->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
|
||||
anv->admin_tagset.numa_node = NUMA_NO_NODE;
|
||||
anv->admin_tagset.cmd_size = sizeof(struct apple_nvme_iod);
|
||||
anv->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
|
||||
anv->admin_tagset.driver_data = &anv->adminq;
|
||||
|
||||
ret = blk_mq_alloc_tag_set(&anv->admin_tagset);
|
||||
@ -1275,7 +1274,6 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
|
||||
anv->tagset.timeout = NVME_IO_TIMEOUT;
|
||||
anv->tagset.numa_node = NUMA_NO_NODE;
|
||||
anv->tagset.cmd_size = sizeof(struct apple_nvme_iod);
|
||||
anv->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
anv->tagset.driver_data = &anv->ioq;
|
||||
|
||||
ret = blk_mq_alloc_tag_set(&anv->tagset);
|
||||
|
@ -885,6 +885,12 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
|
||||
{
|
||||
cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag);
|
||||
cmnd->rw.lbatm = cpu_to_le16(0xffff);
|
||||
}
|
||||
|
||||
static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
|
||||
struct request *req)
|
||||
{
|
||||
@ -1017,18 +1023,17 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
||||
control |= NVME_RW_PRINFO_PRACT;
|
||||
}
|
||||
|
||||
switch (ns->head->pi_type) {
|
||||
case NVME_NS_DPS_PI_TYPE3:
|
||||
if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
|
||||
control |= NVME_RW_PRINFO_PRCHK_GUARD;
|
||||
break;
|
||||
case NVME_NS_DPS_PI_TYPE1:
|
||||
case NVME_NS_DPS_PI_TYPE2:
|
||||
control |= NVME_RW_PRINFO_PRCHK_GUARD |
|
||||
NVME_RW_PRINFO_PRCHK_REF;
|
||||
if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) {
|
||||
control |= NVME_RW_PRINFO_PRCHK_REF;
|
||||
if (op == nvme_cmd_zone_append)
|
||||
control |= NVME_RW_APPEND_PIREMAP;
|
||||
nvme_set_ref_tag(ns, cmnd, req);
|
||||
break;
|
||||
}
|
||||
if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) {
|
||||
control |= NVME_RW_PRINFO_PRCHK_APP;
|
||||
nvme_set_app_tag(req, cmnd);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2128,9 +2133,10 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
|
||||
struct queue_limits lim;
|
||||
int ret;
|
||||
|
||||
blk_mq_freeze_queue(ns->disk->queue);
|
||||
lim = queue_limits_start_update(ns->disk->queue);
|
||||
nvme_set_ctrl_limits(ns->ctrl, &lim);
|
||||
|
||||
blk_mq_freeze_queue(ns->disk->queue);
|
||||
ret = queue_limits_commit_update(ns->disk->queue, &lim);
|
||||
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
|
||||
blk_mq_unfreeze_queue(ns->disk->queue);
|
||||
@ -2177,12 +2183,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
|
||||
goto out;
|
||||
}
|
||||
|
||||
lim = queue_limits_start_update(ns->disk->queue);
|
||||
|
||||
blk_mq_freeze_queue(ns->disk->queue);
|
||||
ns->head->lba_shift = id->lbaf[lbaf].ds;
|
||||
ns->head->nuse = le64_to_cpu(id->nuse);
|
||||
capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
|
||||
|
||||
lim = queue_limits_start_update(ns->disk->queue);
|
||||
nvme_set_ctrl_limits(ns->ctrl, &lim);
|
||||
nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
|
||||
nvme_set_chunk_sectors(ns, id, &lim);
|
||||
@ -2285,6 +2291,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
|
||||
struct queue_limits *ns_lim = &ns->disk->queue->limits;
|
||||
struct queue_limits lim;
|
||||
|
||||
lim = queue_limits_start_update(ns->head->disk->queue);
|
||||
blk_mq_freeze_queue(ns->head->disk->queue);
|
||||
/*
|
||||
* queue_limits mixes values that are the hardware limitations
|
||||
@ -2301,7 +2308,6 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
|
||||
* the splitting limits in to make sure we still obey possibly
|
||||
* lower limitations of other controllers.
|
||||
*/
|
||||
lim = queue_limits_start_update(ns->head->disk->queue);
|
||||
lim.logical_block_size = ns_lim->logical_block_size;
|
||||
lim.physical_block_size = ns_lim->physical_block_size;
|
||||
lim.io_min = ns_lim->io_min;
|
||||
@ -3092,7 +3098,7 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
|
||||
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
|
||||
struct nvme_effects_log **log)
|
||||
{
|
||||
struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
|
||||
struct nvme_effects_log *old, *cel = xa_load(&ctrl->cels, csi);
|
||||
int ret;
|
||||
|
||||
if (cel)
|
||||
@ -3109,7 +3115,11 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
|
||||
return ret;
|
||||
}
|
||||
|
||||
xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
|
||||
old = xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
|
||||
if (xa_is_err(old)) {
|
||||
kfree(cel);
|
||||
return xa_err(old);
|
||||
}
|
||||
out:
|
||||
*log = cel;
|
||||
return 0;
|
||||
@ -3171,6 +3181,25 @@ free_data:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_init_effects_log(struct nvme_ctrl *ctrl,
|
||||
u8 csi, struct nvme_effects_log **log)
|
||||
{
|
||||
struct nvme_effects_log *effects, *old;
|
||||
|
||||
effects = kzalloc(sizeof(*effects), GFP_KERNEL);
|
||||
if (!effects)
|
||||
return -ENOMEM;
|
||||
|
||||
old = xa_store(&ctrl->cels, csi, effects, GFP_KERNEL);
|
||||
if (xa_is_err(old)) {
|
||||
kfree(effects);
|
||||
return xa_err(old);
|
||||
}
|
||||
|
||||
*log = effects;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
struct nvme_effects_log *log = ctrl->effects;
|
||||
@ -3217,10 +3246,9 @@ static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
||||
}
|
||||
|
||||
if (!ctrl->effects) {
|
||||
ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
|
||||
if (!ctrl->effects)
|
||||
return -ENOMEM;
|
||||
xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL);
|
||||
ret = nvme_init_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
|
||||
nvme_init_known_nvm_effects(ctrl);
|
||||
@ -4564,7 +4592,6 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
|
||||
/* Reserved for fabric connect and keep alive */
|
||||
set->reserved_tags = 2;
|
||||
set->numa_node = ctrl->numa_node;
|
||||
set->flags = BLK_MQ_F_NO_SCHED;
|
||||
if (ctrl->ops->flags & NVME_F_BLOCKING)
|
||||
set->flags |= BLK_MQ_F_BLOCKING;
|
||||
set->cmd_size = cmd_size;
|
||||
@ -4639,7 +4666,6 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
|
||||
/* Reserved for fabric connect */
|
||||
set->reserved_tags = 1;
|
||||
set->numa_node = ctrl->numa_node;
|
||||
set->flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
if (ctrl->ops->flags & NVME_F_BLOCKING)
|
||||
set->flags |= BLK_MQ_F_BLOCKING;
|
||||
set->cmd_size = cmd_size;
|
||||
|
@ -16,7 +16,6 @@
|
||||
#include <linux/nvme-fc.h>
|
||||
#include "fc.h"
|
||||
#include <scsi/scsi_transport_fc.h>
|
||||
#include <linux/blk-mq-pci.h>
|
||||
|
||||
/* *************************** Data Structures/Defines ****************** */
|
||||
|
||||
|
@ -1187,43 +1187,4 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
|
||||
return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVME_VERBOSE_ERRORS
|
||||
const char *nvme_get_error_status_str(u16 status);
|
||||
const char *nvme_get_opcode_str(u8 opcode);
|
||||
const char *nvme_get_admin_opcode_str(u8 opcode);
|
||||
const char *nvme_get_fabrics_opcode_str(u8 opcode);
|
||||
#else /* CONFIG_NVME_VERBOSE_ERRORS */
|
||||
static inline const char *nvme_get_error_status_str(u16 status)
|
||||
{
|
||||
return "I/O Error";
|
||||
}
|
||||
static inline const char *nvme_get_opcode_str(u8 opcode)
|
||||
{
|
||||
return "I/O Cmd";
|
||||
}
|
||||
static inline const char *nvme_get_admin_opcode_str(u8 opcode)
|
||||
{
|
||||
return "Admin Cmd";
|
||||
}
|
||||
|
||||
static inline const char *nvme_get_fabrics_opcode_str(u8 opcode)
|
||||
{
|
||||
return "Fabrics Cmd";
|
||||
}
|
||||
#endif /* CONFIG_NVME_VERBOSE_ERRORS */
|
||||
|
||||
static inline const char *nvme_opcode_str(int qid, u8 opcode)
|
||||
{
|
||||
return qid ? nvme_get_opcode_str(opcode) :
|
||||
nvme_get_admin_opcode_str(opcode);
|
||||
}
|
||||
|
||||
static inline const char *nvme_fabrics_opcode_str(
|
||||
int qid, const struct nvme_command *cmd)
|
||||
{
|
||||
if (nvme_is_fabrics(cmd))
|
||||
return nvme_get_fabrics_opcode_str(cmd->fabrics.fctype);
|
||||
|
||||
return nvme_opcode_str(qid, cmd->common.opcode);
|
||||
}
|
||||
#endif /* _NVME_H */
|
||||
|
@ -8,7 +8,6 @@
|
||||
#include <linux/async.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/blk-mq-pci.h>
|
||||
#include <linux/blk-integrity.h>
|
||||
#include <linux/dmi.h>
|
||||
#include <linux/init.h>
|
||||
@ -373,7 +372,7 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
|
||||
/*
|
||||
* Ensure that the doorbell is updated before reading the event
|
||||
* index from memory. The controller needs to provide similar
|
||||
* ordering to ensure the envent index is updated before reading
|
||||
* ordering to ensure the event index is updated before reading
|
||||
* the doorbell.
|
||||
*/
|
||||
mb();
|
||||
@ -463,7 +462,7 @@ static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
|
||||
*/
|
||||
map->queue_offset = qoff;
|
||||
if (i != HCTX_TYPE_POLL && offset)
|
||||
blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
|
||||
blk_mq_map_hw_queues(map, dev->dev, offset);
|
||||
else
|
||||
blk_mq_map_queues(map);
|
||||
qoff += map->nr_queues;
|
||||
@ -1148,13 +1147,13 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
|
||||
}
|
||||
}
|
||||
|
||||
static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
|
||||
struct io_comp_batch *iob)
|
||||
static inline bool nvme_poll_cq(struct nvme_queue *nvmeq,
|
||||
struct io_comp_batch *iob)
|
||||
{
|
||||
int found = 0;
|
||||
bool found = false;
|
||||
|
||||
while (nvme_cqe_pending(nvmeq)) {
|
||||
found++;
|
||||
found = true;
|
||||
/*
|
||||
* load-load control dependency between phase and the rest of
|
||||
* the cqe requires a full read memory barrier
|
||||
@ -2086,8 +2085,8 @@ static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
|
||||
sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma,
|
||||
GFP_KERNEL);
|
||||
if (!dev->host_mem_descs) {
|
||||
dma_free_noncontiguous(dev->dev, dev->host_mem_size,
|
||||
dev->hmb_sgt, DMA_BIDIRECTIONAL);
|
||||
dma_free_noncontiguous(dev->dev, size, dev->hmb_sgt,
|
||||
DMA_BIDIRECTIONAL);
|
||||
dev->hmb_sgt = NULL;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
@ -54,6 +54,8 @@ MODULE_PARM_DESC(tls_handshake_timeout,
|
||||
"nvme TLS handshake timeout in seconds (default 10)");
|
||||
#endif
|
||||
|
||||
static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
/* lockdep can detect a circular dependency of the form
|
||||
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
|
||||
@ -127,6 +129,7 @@ enum nvme_tcp_queue_flags {
|
||||
NVME_TCP_Q_ALLOCATED = 0,
|
||||
NVME_TCP_Q_LIVE = 1,
|
||||
NVME_TCP_Q_POLLING = 2,
|
||||
NVME_TCP_Q_IO_CPU_SET = 3,
|
||||
};
|
||||
|
||||
enum nvme_tcp_recv_state {
|
||||
@@ -1562,23 +1565,56 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
ctrl->io_queues[HCTX_TYPE_POLL];
}

/**
* Track the number of queues assigned to each cpu using a global per-cpu
* counter and select the least used cpu from the mq_map. Our goal is to spread
* different controllers I/O threads across different cpu cores.
*
* Note that the accounting is not 100% perfect, but we don't need to be, we're
* simply putting our best effort to select the best candidate cpu core that we
* find at any given point.
*/
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
int qid = nvme_tcp_queue_id(queue);
int n = 0;
struct blk_mq_tag_set *set = &ctrl->tag_set;
int qid = nvme_tcp_queue_id(queue) - 1;
unsigned int *mq_map = NULL;
int cpu, min_queues = INT_MAX, io_cpu;

if (wq_unbound)
goto out;

if (nvme_tcp_default_queue(queue))
n = qid - 1;
mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
else if (nvme_tcp_read_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
mq_map = set->map[HCTX_TYPE_READ].mq_map;
else if (nvme_tcp_poll_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
ctrl->io_queues[HCTX_TYPE_READ] - 1;
if (wq_unbound)
queue->io_cpu = WORK_CPU_UNBOUND;
else
queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
mq_map = set->map[HCTX_TYPE_POLL].mq_map;

if (WARN_ON(!mq_map))
goto out;

/* Search for the least used cpu from the mq_map */
io_cpu = WORK_CPU_UNBOUND;
for_each_online_cpu(cpu) {
int num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);

if (mq_map[cpu] != qid)
continue;
if (num_queues < min_queues) {
io_cpu = cpu;
min_queues = num_queues;
}
}
if (io_cpu != WORK_CPU_UNBOUND) {
queue->io_cpu = io_cpu;
atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
set_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags);
}
out:
dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
qid, queue->io_cpu);
}

static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
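The comment above describes the placement policy: among online CPUs whose blk-mq map entry points at this hardware queue, pick the one currently serving the fewest nvme-tcp queues. A hedged, standalone userspace sketch of that selection follows; the arrays and values are purely illustrative stand-ins for mq_map and the per-cpu counter.

	/* Hedged sketch of least-used-CPU selection; not kernel code. */
	#include <stdio.h>
	#include <limits.h>

	#define NR_CPUS 8

	int main(void)
	{
		int mq_map[NR_CPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };          /* cpu -> hw queue */
		int queues_per_cpu[NR_CPUS] = { 2, 1, 0, 3, 1, 1, 0, 2 };  /* current load */
		int qid = 1;                       /* hw queue being placed */
		int io_cpu = -1, min_queues = INT_MAX;

		for (int cpu = 0; cpu < NR_CPUS; cpu++) {
			if (mq_map[cpu] != qid)
				continue;
			if (queues_per_cpu[cpu] < min_queues) {
				io_cpu = cpu;
				min_queues = queues_per_cpu[cpu];
			}
		}

		if (io_cpu >= 0)
			queues_per_cpu[io_cpu]++;  /* account for the new queue */
		printf("queue %d -> cpu %d\n", qid, io_cpu);
		return 0;
	}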
@ -1722,7 +1758,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
|
||||
|
||||
queue->sock->sk->sk_allocation = GFP_ATOMIC;
|
||||
queue->sock->sk->sk_use_task_frag = false;
|
||||
nvme_tcp_set_queue_io_cpu(queue);
|
||||
queue->io_cpu = WORK_CPU_UNBOUND;
|
||||
queue->request = NULL;
|
||||
queue->data_remaining = 0;
|
||||
queue->ddgst_remaining = 0;
|
||||
@ -1844,6 +1880,9 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
|
||||
if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
|
||||
return;
|
||||
|
||||
if (test_and_clear_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags))
|
||||
atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
|
||||
|
||||
mutex_lock(&queue->queue_lock);
|
||||
if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
|
||||
__nvme_tcp_stop_queue(queue);
|
||||
@ -1878,9 +1917,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
|
||||
nvme_tcp_init_recv_ctx(queue);
|
||||
nvme_tcp_setup_sock_ops(queue);
|
||||
|
||||
if (idx)
|
||||
if (idx) {
|
||||
nvme_tcp_set_queue_io_cpu(queue);
|
||||
ret = nvmf_connect_io_queue(nctrl, idx);
|
||||
else
|
||||
} else
|
||||
ret = nvmf_connect_admin_queue(nctrl);
|
||||
|
||||
if (!ret) {
|
||||
@ -2845,6 +2885,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
|
||||
static int __init nvme_tcp_init_module(void)
|
||||
{
|
||||
unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
|
||||
int cpu;
|
||||
|
||||
BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
|
||||
@ -2862,6 +2903,9 @@ static int __init nvme_tcp_init_module(void)
|
||||
if (!nvme_tcp_wq)
|
||||
return -ENOMEM;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
|
||||
|
||||
nvmf_register_transport(&nvme_tcp_transport);
|
||||
return 0;
|
||||
}
|
||||
|
@ -115,3 +115,14 @@ config NVME_TARGET_AUTH
|
||||
target side.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NVME_TARGET_PCI_EPF
|
||||
tristate "NVMe PCI Endpoint Function target support"
|
||||
depends on NVME_TARGET && PCI_ENDPOINT
|
||||
depends on NVME_CORE=y || NVME_CORE=NVME_TARGET
|
||||
help
|
||||
This enables the NVMe PCI Endpoint Function target driver support,
|
||||
which allows creating a NVMe PCI controller using an endpoint mode
|
||||
capable PCI controller.
|
||||
|
||||
If unsure, say N.
|
||||
|
@ -8,6 +8,7 @@ obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o
|
||||
obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o
|
||||
obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o
|
||||
obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o
|
||||
obj-$(CONFIG_NVME_TARGET_PCI_EPF) += nvmet-pci-epf.o
|
||||
|
||||
nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
|
||||
discovery.o io-cmd-file.o io-cmd-bdev.o pr.o
|
||||
@ -20,4 +21,5 @@ nvmet-rdma-y += rdma.o
|
||||
nvmet-fc-y += fc.o
|
||||
nvme-fcloop-y += fcloop.o
|
||||
nvmet-tcp-y += tcp.o
|
||||
nvmet-pci-epf-y += pci-epf.o
|
||||
nvmet-$(CONFIG_TRACING) += trace.o
|
||||
|
@ -12,6 +12,142 @@
|
||||
#include <linux/unaligned.h>
|
||||
#include "nvmet.h"
|
||||
|
||||
static void nvmet_execute_delete_sq(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
u16 sqid = le16_to_cpu(req->cmd->delete_queue.qid);
|
||||
u16 status;
|
||||
|
||||
if (!nvmet_is_pci_ctrl(ctrl)) {
|
||||
status = nvmet_report_invalid_opcode(req);
|
||||
goto complete;
|
||||
}
|
||||
|
||||
if (!sqid) {
|
||||
status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
goto complete;
|
||||
}
|
||||
|
||||
status = nvmet_check_sqid(ctrl, sqid, false);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
goto complete;
|
||||
|
||||
status = ctrl->ops->delete_sq(ctrl, sqid);
|
||||
|
||||
complete:
|
||||
nvmet_req_complete(req, status);
|
||||
}
|
||||
|
||||
static void nvmet_execute_create_sq(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
struct nvme_command *cmd = req->cmd;
|
||||
u16 sqid = le16_to_cpu(cmd->create_sq.sqid);
|
||||
u16 cqid = le16_to_cpu(cmd->create_sq.cqid);
|
||||
u16 sq_flags = le16_to_cpu(cmd->create_sq.sq_flags);
|
||||
u16 qsize = le16_to_cpu(cmd->create_sq.qsize);
|
||||
u64 prp1 = le64_to_cpu(cmd->create_sq.prp1);
|
||||
u16 status;
|
||||
|
||||
if (!nvmet_is_pci_ctrl(ctrl)) {
|
||||
status = nvmet_report_invalid_opcode(req);
|
||||
goto complete;
|
||||
}
|
||||
|
||||
if (!sqid) {
|
||||
status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
goto complete;
|
||||
}
|
||||
|
||||
status = nvmet_check_sqid(ctrl, sqid, true);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
goto complete;
|
||||
|
||||
/*
|
||||
* Note: The NVMe specification allows multiple SQs to use the same CQ.
|
||||
* However, the target code does not really support that. So for now,
|
||||
* prevent this and fail the command if sqid and cqid are different.
|
||||
*/
|
||||
if (!cqid || cqid != sqid) {
|
||||
pr_err("SQ %u: Unsupported CQID %u\n", sqid, cqid);
|
||||
status = NVME_SC_CQ_INVALID | NVME_STATUS_DNR;
|
||||
goto complete;
|
||||
}
|
||||
|
||||
if (!qsize || qsize > NVME_CAP_MQES(ctrl->cap)) {
|
||||
status = NVME_SC_QUEUE_SIZE | NVME_STATUS_DNR;
|
||||
goto complete;
|
||||
}
|
||||
|
||||
status = ctrl->ops->create_sq(ctrl, sqid, sq_flags, qsize, prp1);
|
||||
|
||||
complete:
|
||||
nvmet_req_complete(req, status);
|
||||
}
|
||||
|
||||
static void nvmet_execute_delete_cq(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
u16 cqid = le16_to_cpu(req->cmd->delete_queue.qid);
|
||||
u16 status;
|
||||
|
||||
if (!nvmet_is_pci_ctrl(ctrl)) {
|
||||
status = nvmet_report_invalid_opcode(req);
|
||||
goto complete;
|
||||
}
|
||||
|
||||
if (!cqid) {
|
||||
status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
goto complete;
|
||||
}
|
||||
|
||||
status = nvmet_check_cqid(ctrl, cqid);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
goto complete;
|
||||
|
||||
status = ctrl->ops->delete_cq(ctrl, cqid);
|
||||
|
||||
complete:
|
||||
nvmet_req_complete(req, status);
|
||||
}
|
||||
|
||||
static void nvmet_execute_create_cq(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
struct nvme_command *cmd = req->cmd;
|
||||
u16 cqid = le16_to_cpu(cmd->create_cq.cqid);
|
||||
u16 cq_flags = le16_to_cpu(cmd->create_cq.cq_flags);
|
||||
u16 qsize = le16_to_cpu(cmd->create_cq.qsize);
|
||||
u16 irq_vector = le16_to_cpu(cmd->create_cq.irq_vector);
|
||||
u64 prp1 = le64_to_cpu(cmd->create_cq.prp1);
|
||||
u16 status;
|
||||
|
||||
if (!nvmet_is_pci_ctrl(ctrl)) {
|
||||
status = nvmet_report_invalid_opcode(req);
|
||||
goto complete;
|
||||
}
|
||||
|
||||
if (!cqid) {
|
||||
status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
goto complete;
|
||||
}
|
||||
|
||||
status = nvmet_check_cqid(ctrl, cqid);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
goto complete;
|
||||
|
||||
if (!qsize || qsize > NVME_CAP_MQES(ctrl->cap)) {
|
||||
status = NVME_SC_QUEUE_SIZE | NVME_STATUS_DNR;
|
||||
goto complete;
|
||||
}
|
||||
|
||||
status = ctrl->ops->create_cq(ctrl, cqid, cq_flags, qsize,
|
||||
prp1, irq_vector);
|
||||
|
||||
complete:
|
||||
nvmet_req_complete(req, status);
|
||||
}
|
||||
|
||||
u32 nvmet_get_log_page_len(struct nvme_command *cmd)
|
||||
{
|
||||
u32 len = le16_to_cpu(cmd->get_log_page.numdu);
|
||||
@ -230,8 +366,18 @@ out:
|
||||
nvmet_req_complete(req, status);
|
||||
}
|
||||
|
||||
static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log)
|
||||
static void nvmet_get_cmd_effects_admin(struct nvmet_ctrl *ctrl,
|
||||
struct nvme_effects_log *log)
|
||||
{
|
||||
/* For a PCI target controller, advertize support for the . */
|
||||
if (nvmet_is_pci_ctrl(ctrl)) {
|
||||
log->acs[nvme_admin_delete_sq] =
|
||||
log->acs[nvme_admin_create_sq] =
|
||||
log->acs[nvme_admin_delete_cq] =
|
||||
log->acs[nvme_admin_create_cq] =
|
||||
cpu_to_le32(NVME_CMD_EFFECTS_CSUPP);
|
||||
}
|
||||
|
||||
log->acs[nvme_admin_get_log_page] =
|
||||
log->acs[nvme_admin_identify] =
|
||||
log->acs[nvme_admin_abort_cmd] =
|
||||
@ -240,7 +386,10 @@ static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log)
|
||||
log->acs[nvme_admin_async_event] =
|
||||
log->acs[nvme_admin_keep_alive] =
|
||||
cpu_to_le32(NVME_CMD_EFFECTS_CSUPP);
|
||||
}
|
||||
|
||||
static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log)
|
||||
{
|
||||
log->iocs[nvme_cmd_read] =
|
||||
log->iocs[nvme_cmd_flush] =
|
||||
log->iocs[nvme_cmd_dsm] =
|
||||
@ -265,6 +414,7 @@ static void nvmet_get_cmd_effects_zns(struct nvme_effects_log *log)
|
||||
|
||||
static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
struct nvme_effects_log *log;
|
||||
u16 status = NVME_SC_SUCCESS;
|
||||
|
||||
@ -276,6 +426,7 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
|
||||
|
||||
switch (req->cmd->get_log_page.csi) {
|
||||
case NVME_CSI_NVM:
|
||||
nvmet_get_cmd_effects_admin(ctrl, log);
|
||||
nvmet_get_cmd_effects_nvm(log);
|
||||
break;
|
||||
case NVME_CSI_ZNS:
|
||||
@ -283,6 +434,7 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
|
||||
status = NVME_SC_INVALID_IO_CMD_SET;
|
||||
goto free;
|
||||
}
|
||||
nvmet_get_cmd_effects_admin(ctrl, log);
|
||||
nvmet_get_cmd_effects_nvm(log);
|
||||
nvmet_get_cmd_effects_zns(log);
|
||||
break;
|
||||
@ -508,7 +660,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
struct nvmet_subsys *subsys = ctrl->subsys;
|
||||
struct nvme_id_ctrl *id;
|
||||
u32 cmd_capsule_size;
|
||||
u32 cmd_capsule_size, ctratt;
|
||||
u16 status = 0;
|
||||
|
||||
if (!subsys->subsys_discovered) {
|
||||
@ -523,9 +675,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* XXX: figure out how to assign real vendors IDs. */
|
||||
id->vid = 0;
|
||||
id->ssvid = 0;
|
||||
id->vid = cpu_to_le16(subsys->vendor_id);
|
||||
id->ssvid = cpu_to_le16(subsys->subsys_vendor_id);
|
||||
|
||||
memcpy(id->sn, ctrl->subsys->serial, NVMET_SN_MAX_SIZE);
|
||||
memcpy_and_pad(id->mn, sizeof(id->mn), subsys->model_number,
|
||||
@ -557,8 +708,10 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
|
||||
|
||||
/* XXX: figure out what to do about RTD3R/RTD3 */
|
||||
id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
|
||||
id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT |
|
||||
NVME_CTRL_ATTR_TBKAS);
|
||||
ctratt = NVME_CTRL_ATTR_HID_128_BIT | NVME_CTRL_ATTR_TBKAS;
|
||||
if (nvmet_is_pci_ctrl(ctrl))
|
||||
ctratt |= NVME_CTRL_ATTR_RHII;
|
||||
id->ctratt = cpu_to_le32(ctratt);
|
||||
|
||||
id->oacs = 0;
|
||||
|
||||
@ -1105,6 +1258,92 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u16 nvmet_set_feat_host_id(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
|
||||
if (!nvmet_is_pci_ctrl(ctrl))
|
||||
return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR;
|
||||
|
||||
/*
|
||||
* The NVMe base specifications v2.1 recommends supporting 128-bits host
|
||||
* IDs (section 5.1.25.1.28.1). However, that same section also says
|
||||
* that "The controller may support a 64-bit Host Identifier and/or an
|
||||
* extended 128-bit Host Identifier". So simplify this support and do
|
||||
* not support 64-bits host IDs to avoid needing to check that all
|
||||
* controllers associated with the same subsystem all use the same host
|
||||
* ID size.
|
||||
*/
|
||||
if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) {
|
||||
req->error_loc = offsetof(struct nvme_common_command, cdw11);
|
||||
return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
|
||||
}
|
||||
|
||||
return nvmet_copy_from_sgl(req, 0, &req->sq->ctrl->hostid,
|
||||
sizeof(req->sq->ctrl->hostid));
|
||||
}
|
||||
|
||||
static u16 nvmet_set_feat_irq_coalesce(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11);
struct nvmet_feat_irq_coalesce irqc = {
.time = (cdw11 >> 8) & 0xff,
.thr = cdw11 & 0xff,
};

/*
* This feature is not supported for fabrics controllers and mandatory
* for PCI controllers.
*/
if (!nvmet_is_pci_ctrl(ctrl)) {
req->error_loc = offsetof(struct nvme_common_command, cdw10);
return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
}

return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_COALESCE, &irqc);
}
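As the helper above shows, the Interrupt Coalescing feature value travels in CDW11 with the aggregation threshold in the low byte and the aggregation time in the next byte. A small standalone sketch of that encoding, with arbitrary example values:

	/* Hedged sketch mirroring the .time/.thr extraction above. */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t cdw11 = (20u << 8) | 8u;     /* example: time = 20, thr = 8 */
		uint8_t time = (cdw11 >> 8) & 0xff;
		uint8_t thr = cdw11 & 0xff;

		printf("cdw11=0x%08x -> time=%u thr=%u\n", (unsigned)cdw11, time, thr);
		printf("re-encoded: 0x%08x\n", (unsigned)(((uint32_t)time << 8) | thr));
		return 0;
	}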
|
||||
static u16 nvmet_set_feat_irq_config(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11);
|
||||
struct nvmet_feat_irq_config irqcfg = {
|
||||
.iv = cdw11 & 0xffff,
|
||||
.cd = (cdw11 >> 16) & 0x1,
|
||||
};
|
||||
|
||||
/*
|
||||
* This feature is not supported for fabrics controllers and mandatory
|
||||
* for PCI controllers.
|
||||
*/
|
||||
if (!nvmet_is_pci_ctrl(ctrl)) {
|
||||
req->error_loc = offsetof(struct nvme_common_command, cdw10);
|
||||
return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
|
||||
}
|
||||
|
||||
return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_CONFIG, &irqcfg);
|
||||
}
|
||||
|
||||
static u16 nvmet_set_feat_arbitration(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11);
|
||||
struct nvmet_feat_arbitration arb = {
|
||||
.hpw = (cdw11 >> 24) & 0xff,
|
||||
.mpw = (cdw11 >> 16) & 0xff,
|
||||
.lpw = (cdw11 >> 8) & 0xff,
|
||||
.ab = cdw11 & 0x3,
|
||||
};
|
||||
|
||||
if (!ctrl->ops->set_feature) {
|
||||
req->error_loc = offsetof(struct nvme_common_command, cdw10);
|
||||
return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
|
||||
}
|
||||
|
||||
return ctrl->ops->set_feature(ctrl, NVME_FEAT_ARBITRATION, &arb);
|
||||
}
|
||||
|
||||
void nvmet_execute_set_features(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_subsys *subsys = nvmet_req_subsys(req);
|
||||
@ -1118,6 +1357,9 @@ void nvmet_execute_set_features(struct nvmet_req *req)
|
||||
return;
|
||||
|
||||
switch (cdw10 & 0xff) {
|
||||
case NVME_FEAT_ARBITRATION:
|
||||
status = nvmet_set_feat_arbitration(req);
|
||||
break;
|
||||
case NVME_FEAT_NUM_QUEUES:
|
||||
ncqr = (cdw11 >> 16) & 0xffff;
|
||||
nsqr = cdw11 & 0xffff;
|
||||
@ -1128,6 +1370,12 @@ void nvmet_execute_set_features(struct nvmet_req *req)
|
||||
nvmet_set_result(req,
|
||||
(subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16));
|
||||
break;
|
||||
case NVME_FEAT_IRQ_COALESCE:
|
||||
status = nvmet_set_feat_irq_coalesce(req);
|
||||
break;
|
||||
case NVME_FEAT_IRQ_CONFIG:
|
||||
status = nvmet_set_feat_irq_config(req);
|
||||
break;
|
||||
case NVME_FEAT_KATO:
|
||||
status = nvmet_set_feat_kato(req);
|
||||
break;
|
||||
@ -1135,7 +1383,7 @@ void nvmet_execute_set_features(struct nvmet_req *req)
|
||||
status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL);
|
||||
break;
|
||||
case NVME_FEAT_HOST_ID:
|
||||
status = NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR;
|
||||
status = nvmet_set_feat_host_id(req);
|
||||
break;
|
||||
case NVME_FEAT_WRITE_PROTECT:
|
||||
status = nvmet_set_feat_write_protect(req);
|
||||
@ -1172,6 +1420,79 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u16 nvmet_get_feat_irq_coalesce(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
struct nvmet_feat_irq_coalesce irqc = { };
|
||||
u16 status;
|
||||
|
||||
/*
|
||||
* This feature is not supported for fabrics controllers and mandatory
|
||||
* for PCI controllers.
|
||||
*/
|
||||
if (!nvmet_is_pci_ctrl(ctrl)) {
|
||||
req->error_loc = offsetof(struct nvme_common_command, cdw10);
|
||||
return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
|
||||
}
|
||||
|
||||
status = ctrl->ops->get_feature(ctrl, NVME_FEAT_IRQ_COALESCE, &irqc);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
return status;
|
||||
|
||||
nvmet_set_result(req, ((u32)irqc.time << 8) | (u32)irqc.thr);
|
||||
|
||||
return NVME_SC_SUCCESS;
|
||||
}
|
||||
|
||||
static u16 nvmet_get_feat_irq_config(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
u32 iv = le32_to_cpu(req->cmd->common.cdw11) & 0xffff;
|
||||
struct nvmet_feat_irq_config irqcfg = { .iv = iv };
|
||||
u16 status;
|
||||
|
||||
/*
|
||||
* This feature is not supported for fabrics controllers and mandatory
|
||||
* for PCI controllers.
|
||||
*/
|
||||
if (!nvmet_is_pci_ctrl(ctrl)) {
|
||||
req->error_loc = offsetof(struct nvme_common_command, cdw10);
|
||||
return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
|
||||
}
|
||||
|
||||
status = ctrl->ops->get_feature(ctrl, NVME_FEAT_IRQ_CONFIG, &irqcfg);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
return status;
|
||||
|
||||
nvmet_set_result(req, ((u32)irqcfg.cd << 16) | iv);
|
||||
|
||||
return NVME_SC_SUCCESS;
|
||||
}
|
||||
|
||||
static u16 nvmet_get_feat_arbitration(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmet_feat_arbitration arb = { };
u16 status;

if (!ctrl->ops->get_feature) {
req->error_loc = offsetof(struct nvme_common_command, cdw10);
return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
}

status = ctrl->ops->get_feature(ctrl, NVME_FEAT_ARBITRATION, &arb);
if (status != NVME_SC_SUCCESS)
return status;

nvmet_set_result(req,
((u32)arb.hpw << 24) |
((u32)arb.mpw << 16) |
((u32)arb.lpw << 8) |
(arb.ab & 0x3));

return NVME_SC_SUCCESS;
}
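The result packing above mirrors the layout used by the set path earlier in this file: HPW in bits 31:24, MPW in 23:16, LPW in 15:8, and AB in the low bits. A standalone sketch of packing and unpacking that dword, with arbitrary example values:

	/* Hedged sketch of the arbitration dword layout used above. */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint8_t hpw = 32, mpw = 16, lpw = 8, ab = 3;
		uint32_t dw = ((uint32_t)hpw << 24) | ((uint32_t)mpw << 16) |
			      ((uint32_t)lpw << 8) | (ab & 0x3);

		printf("packed: 0x%08x\n", (unsigned)dw);
		printf("hpw=%u mpw=%u lpw=%u ab=%u\n",
		       (unsigned)((dw >> 24) & 0xff), (unsigned)((dw >> 16) & 0xff),
		       (unsigned)((dw >> 8) & 0xff), (unsigned)(dw & 0x3));
		return 0;
	}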
|
||||
void nvmet_get_feat_kato(struct nvmet_req *req)
|
||||
{
|
||||
nvmet_set_result(req, req->sq->ctrl->kato * 1000);
|
||||
@ -1198,21 +1519,24 @@ void nvmet_execute_get_features(struct nvmet_req *req)
|
||||
* need to come up with some fake values for these.
|
||||
*/
|
||||
#if 0
|
||||
case NVME_FEAT_ARBITRATION:
|
||||
break;
|
||||
case NVME_FEAT_POWER_MGMT:
|
||||
break;
|
||||
case NVME_FEAT_TEMP_THRESH:
|
||||
break;
|
||||
case NVME_FEAT_ERR_RECOVERY:
|
||||
break;
|
||||
case NVME_FEAT_IRQ_COALESCE:
|
||||
break;
|
||||
case NVME_FEAT_IRQ_CONFIG:
|
||||
break;
|
||||
case NVME_FEAT_WRITE_ATOMIC:
|
||||
break;
|
||||
#endif
|
||||
case NVME_FEAT_ARBITRATION:
|
||||
status = nvmet_get_feat_arbitration(req);
|
||||
break;
|
||||
case NVME_FEAT_IRQ_COALESCE:
|
||||
status = nvmet_get_feat_irq_coalesce(req);
|
||||
break;
|
||||
case NVME_FEAT_IRQ_CONFIG:
|
||||
status = nvmet_get_feat_irq_config(req);
|
||||
break;
|
||||
case NVME_FEAT_ASYNC_EVENT:
|
||||
nvmet_get_feat_async_event(req);
|
||||
break;
|
||||
@ -1293,6 +1617,27 @@ out:
|
||||
nvmet_req_complete(req, status);
|
||||
}
|
||||
|
||||
u32 nvmet_admin_cmd_data_len(struct nvmet_req *req)
|
||||
{
|
||||
struct nvme_command *cmd = req->cmd;
|
||||
|
||||
if (nvme_is_fabrics(cmd))
|
||||
return nvmet_fabrics_admin_cmd_data_len(req);
|
||||
if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
|
||||
return nvmet_discovery_cmd_data_len(req);
|
||||
|
||||
switch (cmd->common.opcode) {
|
||||
case nvme_admin_get_log_page:
|
||||
return nvmet_get_log_page_len(cmd);
|
||||
case nvme_admin_identify:
|
||||
return NVME_IDENTIFY_DATA_SIZE;
|
||||
case nvme_admin_get_features:
|
||||
return nvmet_feat_data_len(req, le32_to_cpu(cmd->common.cdw10));
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
|
||||
{
|
||||
struct nvme_command *cmd = req->cmd;
|
||||
@ -1307,13 +1652,30 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
/* For PCI controllers, admin commands shall not use SGL. */
|
||||
if (nvmet_is_pci_ctrl(req->sq->ctrl) && !req->sq->qid &&
|
||||
cmd->common.flags & NVME_CMD_SGL_ALL)
|
||||
return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
|
||||
|
||||
if (nvmet_is_passthru_req(req))
|
||||
return nvmet_parse_passthru_admin_cmd(req);
|
||||
|
||||
switch (cmd->common.opcode) {
|
||||
case nvme_admin_delete_sq:
|
||||
req->execute = nvmet_execute_delete_sq;
|
||||
return 0;
|
||||
case nvme_admin_create_sq:
|
||||
req->execute = nvmet_execute_create_sq;
|
||||
return 0;
|
||||
case nvme_admin_get_log_page:
|
||||
req->execute = nvmet_execute_get_log_page;
|
||||
return 0;
|
||||
case nvme_admin_delete_cq:
|
||||
req->execute = nvmet_execute_delete_cq;
|
||||
return 0;
|
||||
case nvme_admin_create_cq:
|
||||
req->execute = nvmet_execute_create_cq;
|
||||
return 0;
|
||||
case nvme_admin_identify:
|
||||
req->execute = nvmet_execute_identify;
|
||||
return 0;
|
||||
|
@ -37,6 +37,7 @@ static struct nvmet_type_name_map nvmet_transport[] = {
|
||||
{ NVMF_TRTYPE_RDMA, "rdma" },
|
||||
{ NVMF_TRTYPE_FC, "fc" },
|
||||
{ NVMF_TRTYPE_TCP, "tcp" },
|
||||
{ NVMF_TRTYPE_PCI, "pci" },
|
||||
{ NVMF_TRTYPE_LOOP, "loop" },
|
||||
};
|
||||
|
||||
@ -46,6 +47,7 @@ static const struct nvmet_type_name_map nvmet_addr_family[] = {
|
||||
{ NVMF_ADDR_FAMILY_IP6, "ipv6" },
|
||||
{ NVMF_ADDR_FAMILY_IB, "ib" },
|
||||
{ NVMF_ADDR_FAMILY_FC, "fc" },
|
||||
{ NVMF_ADDR_FAMILY_PCI, "pci" },
|
||||
{ NVMF_ADDR_FAMILY_LOOP, "loop" },
|
||||
};
|
||||
|
||||
@ -1400,6 +1402,49 @@ out_unlock:
|
||||
}
|
||||
CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_max);
|
||||
|
||||
static ssize_t nvmet_subsys_attr_vendor_id_show(struct config_item *item,
|
||||
char *page)
|
||||
{
|
||||
return snprintf(page, PAGE_SIZE, "0x%x\n", to_subsys(item)->vendor_id);
|
||||
}
|
||||
|
||||
static ssize_t nvmet_subsys_attr_vendor_id_store(struct config_item *item,
|
||||
const char *page, size_t count)
|
||||
{
|
||||
u16 vid;
|
||||
|
||||
if (kstrtou16(page, 0, &vid))
|
||||
return -EINVAL;
|
||||
|
||||
down_write(&nvmet_config_sem);
|
||||
to_subsys(item)->vendor_id = vid;
|
||||
up_write(&nvmet_config_sem);
|
||||
return count;
|
||||
}
|
||||
CONFIGFS_ATTR(nvmet_subsys_, attr_vendor_id);
|
||||
|
||||
static ssize_t nvmet_subsys_attr_subsys_vendor_id_show(struct config_item *item,
|
||||
char *page)
|
||||
{
|
||||
return snprintf(page, PAGE_SIZE, "0x%x\n",
|
||||
to_subsys(item)->subsys_vendor_id);
|
||||
}
|
||||
|
||||
static ssize_t nvmet_subsys_attr_subsys_vendor_id_store(struct config_item *item,
|
||||
const char *page, size_t count)
|
||||
{
|
||||
u16 ssvid;
|
||||
|
||||
if (kstrtou16(page, 0, &ssvid))
|
||||
return -EINVAL;
|
||||
|
||||
down_write(&nvmet_config_sem);
|
||||
to_subsys(item)->subsys_vendor_id = ssvid;
|
||||
up_write(&nvmet_config_sem);
|
||||
return count;
|
||||
}
|
||||
CONFIGFS_ATTR(nvmet_subsys_, attr_subsys_vendor_id);
|
||||
|
||||
static ssize_t nvmet_subsys_attr_model_show(struct config_item *item,
|
||||
char *page)
|
||||
{
|
||||
@ -1628,6 +1673,8 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = {
|
||||
&nvmet_subsys_attr_attr_serial,
|
||||
&nvmet_subsys_attr_attr_cntlid_min,
|
||||
&nvmet_subsys_attr_attr_cntlid_max,
|
||||
&nvmet_subsys_attr_attr_vendor_id,
|
||||
&nvmet_subsys_attr_attr_subsys_vendor_id,
|
||||
&nvmet_subsys_attr_attr_model,
|
||||
&nvmet_subsys_attr_attr_qid_max,
|
||||
&nvmet_subsys_attr_attr_ieee_oui,
|
||||
@ -1782,6 +1829,7 @@ static struct config_group *nvmet_referral_make(
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
INIT_LIST_HEAD(&port->entry);
|
||||
port->disc_addr.trtype = NVMF_TRTYPE_MAX;
|
||||
config_group_init_type_name(&port->group, name, &nvmet_referral_type);
|
||||
|
||||
return &port->group;
|
||||
@ -2007,6 +2055,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
|
||||
port->inline_data_size = -1; /* < 0 == let the transport choose */
|
||||
port->max_queue_size = -1; /* < 0 == let the transport choose */
|
||||
|
||||
port->disc_addr.trtype = NVMF_TRTYPE_MAX;
|
||||
port->disc_addr.portid = cpu_to_le16(portid);
|
||||
port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX;
|
||||
port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW;
|
||||
|
@ -836,6 +836,89 @@ static void nvmet_confirm_sq(struct percpu_ref *ref)
|
||||
complete(&sq->confirm_done);
|
||||
}
|
||||
|
||||
u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid)
|
||||
{
|
||||
if (!ctrl->sqs)
|
||||
return NVME_SC_INTERNAL | NVME_STATUS_DNR;
|
||||
|
||||
if (cqid > ctrl->subsys->max_qid)
|
||||
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
|
||||
/*
|
||||
* Note: For PCI controllers, the NVMe specifications allows multiple
|
||||
* SQs to share a single CQ. However, we do not support this yet, so
|
||||
* check that there is no SQ defined for a CQ. If one exist, then the
|
||||
* CQ ID is invalid for creation as well as when the CQ is being
|
||||
* deleted (as that would mean that the SQ was not deleted before the
|
||||
* CQ).
|
||||
*/
|
||||
if (ctrl->sqs[cqid])
|
||||
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
|
||||
return NVME_SC_SUCCESS;
|
||||
}
|
||||
|
||||
u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
|
||||
u16 qid, u16 size)
|
||||
{
|
||||
u16 status;
|
||||
|
||||
status = nvmet_check_cqid(ctrl, qid);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
return status;
|
||||
|
||||
nvmet_cq_setup(ctrl, cq, qid, size);
|
||||
|
||||
return NVME_SC_SUCCESS;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvmet_cq_create);
|
||||
|
||||
u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid,
|
||||
bool create)
|
||||
{
|
||||
if (!ctrl->sqs)
|
||||
return NVME_SC_INTERNAL | NVME_STATUS_DNR;
|
||||
|
||||
if (sqid > ctrl->subsys->max_qid)
|
||||
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
|
||||
if ((create && ctrl->sqs[sqid]) ||
|
||||
(!create && !ctrl->sqs[sqid]))
|
||||
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
|
||||
return NVME_SC_SUCCESS;
|
||||
}
|
||||
|
||||
u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
|
||||
u16 sqid, u16 size)
|
||||
{
|
||||
u16 status;
|
||||
int ret;
|
||||
|
||||
if (!kref_get_unless_zero(&ctrl->ref))
|
||||
return NVME_SC_INTERNAL | NVME_STATUS_DNR;
|
||||
|
||||
status = nvmet_check_sqid(ctrl, sqid, true);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
return status;
|
||||
|
||||
ret = nvmet_sq_init(sq);
|
||||
if (ret) {
|
||||
status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
|
||||
goto ctrl_put;
|
||||
}
|
||||
|
||||
nvmet_sq_setup(ctrl, sq, sqid, size);
|
||||
sq->ctrl = ctrl;
|
||||
|
||||
return NVME_SC_SUCCESS;
|
||||
|
||||
ctrl_put:
|
||||
nvmet_ctrl_put(ctrl);
|
||||
return status;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvmet_sq_create);
|
||||
|
||||
void nvmet_sq_destroy(struct nvmet_sq *sq)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = sq->ctrl;
|
||||
@@ -929,6 +1012,33 @@ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
        return 0;
}

static u32 nvmet_io_cmd_transfer_len(struct nvmet_req *req)
{
        struct nvme_command *cmd = req->cmd;
        u32 metadata_len = 0;

        if (nvme_is_fabrics(cmd))
                return nvmet_fabrics_io_cmd_data_len(req);

        if (!req->ns)
                return 0;

        switch (req->cmd->common.opcode) {
        case nvme_cmd_read:
        case nvme_cmd_write:
        case nvme_cmd_zone_append:
                if (req->sq->ctrl->pi_support && nvmet_ns_has_pi(req->ns))
                        metadata_len = nvmet_rw_metadata_len(req);
                return nvmet_rw_data_len(req) + metadata_len;
        case nvme_cmd_dsm:
                return nvmet_dsm_len(req);
        case nvme_cmd_zone_mgmt_recv:
                return (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
        default:
                return 0;
        }
}

static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
        struct nvme_command *cmd = req->cmd;
@@ -1030,12 +1140,15 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
        /*
         * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
         * contains an address of a single contiguous physical buffer that is
         * byte aligned.
         * byte aligned. For PCI controllers, this is optional so not enforced.
         */
        if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
                req->error_loc = offsetof(struct nvme_common_command, flags);
                status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
                goto fail;
                if (!req->sq->ctrl || !nvmet_is_pci_ctrl(req->sq->ctrl)) {
                        req->error_loc =
                                offsetof(struct nvme_common_command, flags);
                        status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
                        goto fail;
                }
        }

        if (unlikely(!req->sq->ctrl))
@@ -1077,11 +1190,27 @@ void nvmet_req_uninit(struct nvmet_req *req)
}
EXPORT_SYMBOL_GPL(nvmet_req_uninit);

size_t nvmet_req_transfer_len(struct nvmet_req *req)
{
        if (likely(req->sq->qid != 0))
                return nvmet_io_cmd_transfer_len(req);
        if (unlikely(!req->sq->ctrl))
                return nvmet_connect_cmd_data_len(req);
        return nvmet_admin_cmd_data_len(req);
}
EXPORT_SYMBOL_GPL(nvmet_req_transfer_len);

bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len)
{
        if (unlikely(len != req->transfer_len)) {
                u16 status;

                req->error_loc = offsetof(struct nvme_common_command, dptr);
                nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR);
                if (req->cmd->common.flags & NVME_CMD_SGL_ALL)
                        status = NVME_SC_SGL_INVALID_DATA;
                else
                        status = NVME_SC_INVALID_FIELD;
                nvmet_req_complete(req, status | NVME_STATUS_DNR);
                return false;
        }

@@ -1092,8 +1221,14 @@ EXPORT_SYMBOL_GPL(nvmet_check_transfer_len);
bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len)
{
        if (unlikely(data_len > req->transfer_len)) {
                u16 status;

                req->error_loc = offsetof(struct nvme_common_command, dptr);
                nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR);
                if (req->cmd->common.flags & NVME_CMD_SGL_ALL)
                        status = NVME_SC_SGL_INVALID_DATA;
                else
                        status = NVME_SC_INVALID_FIELD;
                nvmet_req_complete(req, status | NVME_STATUS_DNR);
                return false;
        }

@@ -1184,41 +1319,6 @@ void nvmet_req_free_sgls(struct nvmet_req *req)
}
EXPORT_SYMBOL_GPL(nvmet_req_free_sgls);

static inline bool nvmet_cc_en(u32 cc)
{
        return (cc >> NVME_CC_EN_SHIFT) & 0x1;
}

static inline u8 nvmet_cc_css(u32 cc)
{
        return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
}

static inline u8 nvmet_cc_mps(u32 cc)
{
        return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
}

static inline u8 nvmet_cc_ams(u32 cc)
{
        return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
}

static inline u8 nvmet_cc_shn(u32 cc)
{
        return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
}

static inline u8 nvmet_cc_iosqes(u32 cc)
{
        return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
}

static inline u8 nvmet_cc_iocqes(u32 cc)
{
        return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
}

static inline bool nvmet_css_supported(u8 cc_css)
{
        switch (cc_css << NVME_CC_CSS_SHIFT) {
@@ -1295,6 +1395,7 @@ void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
        ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
        mutex_unlock(&ctrl->lock);
}
EXPORT_SYMBOL_GPL(nvmet_update_cc);

static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
{
@@ -1402,15 +1503,15 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
 * Note: ctrl->subsys->lock should be held when calling this function
 */
static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
                struct nvmet_req *req)
                struct device *p2p_client)
{
        struct nvmet_ns *ns;
        unsigned long idx;

        if (!req->p2p_client)
        if (!p2p_client)
                return;

        ctrl->p2p_client = get_device(req->p2p_client);
        ctrl->p2p_client = get_device(p2p_client);

        nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns)
                nvmet_p2pmem_ns_add_p2p(ctrl, ns);
@@ -1439,45 +1540,44 @@ static void nvmet_fatal_error_handler(struct work_struct *work)
        ctrl->ops->delete_ctrl(ctrl);
}

u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
                struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp,
                uuid_t *hostid)
struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
{
        struct nvmet_subsys *subsys;
        struct nvmet_ctrl *ctrl;
        u32 kato = args->kato;
        u8 dhchap_status;
        int ret;
        u16 status;

        status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR;
        subsys = nvmet_find_get_subsys(req->port, subsysnqn);
        args->status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR;
        subsys = nvmet_find_get_subsys(args->port, args->subsysnqn);
        if (!subsys) {
                pr_warn("connect request for invalid subsystem %s!\n",
                        subsysnqn);
                req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
                req->error_loc = offsetof(struct nvme_common_command, dptr);
                goto out;
                        args->subsysnqn);
                args->result = IPO_IATTR_CONNECT_DATA(subsysnqn);
                args->error_loc = offsetof(struct nvme_common_command, dptr);
                return NULL;
        }

        down_read(&nvmet_config_sem);
        if (!nvmet_host_allowed(subsys, hostnqn)) {
        if (!nvmet_host_allowed(subsys, args->hostnqn)) {
                pr_info("connect by host %s for subsystem %s not allowed\n",
                        hostnqn, subsysnqn);
                req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
                        args->hostnqn, args->subsysnqn);
                args->result = IPO_IATTR_CONNECT_DATA(hostnqn);
                up_read(&nvmet_config_sem);
                status = NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR;
                req->error_loc = offsetof(struct nvme_common_command, dptr);
                args->status = NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR;
                args->error_loc = offsetof(struct nvme_common_command, dptr);
                goto out_put_subsystem;
        }
        up_read(&nvmet_config_sem);

        status = NVME_SC_INTERNAL;
        args->status = NVME_SC_INTERNAL;
        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl)
                goto out_put_subsystem;
        mutex_init(&ctrl->lock);

        ctrl->port = req->port;
        ctrl->ops = req->ops;
        ctrl->port = args->port;
        ctrl->ops = args->ops;

#ifdef CONFIG_NVME_TARGET_PASSTHRU
        /* By default, set loop targets to clear IDS by default */
@@ -1491,8 +1591,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
        INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
        INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);

        memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
        memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
        memcpy(ctrl->subsysnqn, args->subsysnqn, NVMF_NQN_SIZE);
        memcpy(ctrl->hostnqn, args->hostnqn, NVMF_NQN_SIZE);

        kref_init(&ctrl->ref);
        ctrl->subsys = subsys;
@@ -1515,12 +1615,12 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
                        subsys->cntlid_min, subsys->cntlid_max,
                        GFP_KERNEL);
        if (ret < 0) {
                status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
                args->status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
                goto out_free_sqs;
        }
        ctrl->cntlid = ret;

        uuid_copy(&ctrl->hostid, hostid);
        uuid_copy(&ctrl->hostid, args->hostid);

        /*
         * Discovery controllers may use some arbitrary high value
@@ -1542,12 +1642,35 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
        if (ret)
                goto init_pr_fail;
        list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
        nvmet_setup_p2p_ns_map(ctrl, req);
        nvmet_setup_p2p_ns_map(ctrl, args->p2p_client);
        nvmet_debugfs_ctrl_setup(ctrl);
        mutex_unlock(&subsys->lock);

        *ctrlp = ctrl;
        return 0;
        if (args->hostid)
                uuid_copy(&ctrl->hostid, args->hostid);

        dhchap_status = nvmet_setup_auth(ctrl);
        if (dhchap_status) {
                pr_err("Failed to setup authentication, dhchap status %u\n",
                        dhchap_status);
                nvmet_ctrl_put(ctrl);
                if (dhchap_status == NVME_AUTH_DHCHAP_FAILURE_FAILED)
                        args->status =
                                NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR;
                else
                        args->status = NVME_SC_INTERNAL;
                return NULL;
        }

        args->status = NVME_SC_SUCCESS;

        pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s.\n",
                nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm",
                ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn,
                ctrl->pi_support ? " T10-PI is enabled" : "",
                nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : "");

        return ctrl;

init_pr_fail:
        mutex_unlock(&subsys->lock);
@@ -1561,9 +1684,9 @@ out_free_ctrl:
        kfree(ctrl);
out_put_subsystem:
        nvmet_subsys_put(subsys);
out:
        return status;
        return NULL;
}
EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);

static void nvmet_ctrl_free(struct kref *ref)
{
@@ -1599,6 +1722,7 @@ void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
{
        kref_put(&ctrl->ref, nvmet_ctrl_free);
}
EXPORT_SYMBOL_GPL(nvmet_ctrl_put);

void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
{
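The hunks above rework nvmet_alloc_ctrl() to take a struct nvmet_alloc_ctrl_args and to report the NVMe status through args->status instead of a u16 return value and an output controller pointer. As a hedged illustration only (not code from this commit), a fabrics-style connect path might call it roughly as sketched below; the exact layout of struct nvmet_alloc_ctrl_args is not visible in this excerpt, so the field set used here is inferred from the hunks above and the wrapper function name is hypothetical.

/*
 * Illustrative sketch only: not part of this diff. Shows how a transport
 * connect handler might fill nvmet_alloc_ctrl_args using fields visible in
 * the hunks above. example_connect_ctrl() is a hypothetical helper.
 */
static struct nvmet_ctrl *example_connect_ctrl(struct nvmet_req *req,
                char *subsysnqn, char *hostnqn, uuid_t *hostid, u32 kato)
{
        struct nvmet_alloc_ctrl_args args = {
                .port           = req->port,
                .ops            = req->ops,
                .subsysnqn      = subsysnqn,
                .hostnqn        = hostnqn,
                .hostid         = hostid,
                .kato           = kato,
                .p2p_client     = req->p2p_client,
        };
        struct nvmet_ctrl *ctrl;

        ctrl = nvmet_alloc_ctrl(&args);
        if (!ctrl) {
                /* Failure details are reported back through args. */
                req->error_loc = args.error_loc;
                req->cqe->result.u32 = args.result;
                return NULL;
        }

        /* On success, args.status is NVME_SC_SUCCESS. */
        return ctrl;
}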
@@ -224,6 +224,9 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req)
        }

        list_for_each_entry(r, &req->port->referrals, entry) {
                if (r->disc_addr.trtype == NVMF_TRTYPE_PCI)
                        continue;

                nvmet_format_discovery_entry(hdr, r,
                                NVME_DISC_SUBSYS_NAME,
                                r->disc_addr.traddr,
@@ -352,6 +355,20 @@ static void nvmet_execute_disc_get_features(struct nvmet_req *req)
        nvmet_req_complete(req, stat);
}

u32 nvmet_discovery_cmd_data_len(struct nvmet_req *req)
{
        struct nvme_command *cmd = req->cmd;

        switch (cmd->common.opcode) {
        case nvme_admin_get_log_page:
                return nvmet_get_log_page_len(req->cmd);
        case nvme_admin_identify:
                return NVME_IDENTIFY_DATA_SIZE;
        default:
                return 0;
        }
}

u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
{
        struct nvme_command *cmd = req->cmd;
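The core hunks earlier in this diff also export nvmet_cq_create() and nvmet_sq_create(), backed by the nvmet_check_cqid()/nvmet_check_sqid() validators. As a rough, hedged sketch only (not code from this commit), a PCI transport servicing the host's Create I/O CQ/SQ commands could use them along these lines; the wrapper name and the idea of passing the same qid for the paired CQ and SQ are assumptions made for illustration.

/*
 * Illustrative sketch only: not part of this diff. The qid and qsize values
 * would come from the host's Create I/O CQ/SQ command dwords.
 */
static u16 example_create_io_queue_pair(struct nvmet_ctrl *ctrl,
                struct nvmet_cq *cq, struct nvmet_sq *sq, u16 qid, u16 qsize)
{
        u16 status;

        /* Validates the CQ ID and sets up the target-side CQ. */
        status = nvmet_cq_create(ctrl, cq, qid, qsize);
        if (status != NVME_SC_SUCCESS)
                return status;

        /*
         * Validates the SQ ID, initializes the target-side SQ and takes a
         * reference on the controller for the lifetime of the SQ.
         */
        return nvmet_sq_create(ctrl, sq, qid, qsize);
}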
Some files were not shown because too many files have changed in this diff.