for-5.20/block-2022-08-04

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmLsRfkQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpj43EADBydQhe7nQHH65gecqvttnio2GqEmcbozt
 lKFQlPPd3SHGMAJjSdR1dIwqtPsJ8q6xZXH+TjHhLXb2kgVu+TQ31krNHIqBwE14
 s7SsgGRgvopA46lSf/ls18/8sh6Yz1NgI39YcMVPjvkbLaVFK7zRkL9OSp4RQCwH
 u/IIHJmV415EeF6QNTgABBel/gEIPBLsvwOxTBIkzDOyUohtExZPYj83MDm7jdr3
 jsTUd2MiumNMh7ziMJIp1iN32nQOtIKtwWZaMHDCzfU/IUnBSmh2nj9oXr3+vcwo
 IsBMDUfUj9Eig5QQ/XcVIrFezi0GnunpBhScXPqL+dxPN812lzxNjkx6PsC+rPn8
 mWmXoaeK1ayoyotdHJlmINNmWUSCkOMwVnA2r1c4Hp4cQS5vRUtkKcpNLTpMhk4I
 OwQ3bjt9mA//WlH+apbhJqXqxjcoBwCwMoveJ4mHVtku9lo+JJAKVGdUs17QjZkC
 NxACP1MtBcXy1hurNQf14oH5C0Hyg4TBJShPauKmrqGtOFnbOAdX2qIhldvyNfH1
 l9cOvGNSgbQ6FLD6MVto6dC/KYOEM3LelVxgNB/80GbSmGwj88Kd/nzQLYFP89JJ
 0Wkt14mSkm82gabOvNqXGG8P8hLb/+v6sp4qZv0mf+op0xmb4FB5eaZvoceptVzM
 3Z+hmT7MfA==
 =pgNf
 -----END PGP SIGNATURE-----

Merge tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block

Pull block driver updates from Jens Axboe:

 - NVMe pull requests via Christoph:
      - add support for In-Band authentication (Hannes Reinecke)
      - handle the persistent internal error AER (Michael Kelley)
      - use in-capsule data for TCP I/O queue connect (Caleb Sander)
      - remove timeout for getting RDMA-CM established event (Israel
        Rukshin)
      - misc cleanups (Joel Granados, Sagi Grimberg, Chaitanya Kulkarni,
        Guixin Liu, Xiang wangx)
      - use command_id instead of req->tag in trace_nvme_complete_rq()
        (Bean Huo)
      - various fixes for the new authentication code (Lukas Bulwahn,
        Dan Carpenter, Colin Ian King, Chaitanya Kulkarni, Hannes
        Reinecke)
      - small cleanups (Liu Song, Christoph Hellwig)
      - restore compat_ioctl support (Nick Bowler)
      - make a nvmet-tcp workqueue lockdep-safe (Sagi Grimberg)
      - enable generic interface (/dev/ngXnY) for unknown command sets
        (Joel Granados, Christoph Hellwig)
      - don't always build constants.o (Christoph Hellwig)
      - print the command name of aborted commands (Christoph Hellwig)

 - MD pull requests via Song:
      - Improve raid5 lock contention, by Logan Gunthorpe.
      - Misc fixes to raid5, by Logan Gunthorpe.
      - Fix race condition with md_reap_sync_thread(), by Guoqing Jiang.
      - Fix potential deadlock with raid5_quiesce and
        raid5_get_active_stripe, by Logan Gunthorpe.
      - Refactoring md_alloc(), by Christoph Hellwig.
      - Fix md disk_name lifetime problems, by Christoph Hellwig.
      - Convert prepare_to_wait() to wait_woken() api, by Logan
        Gunthorpe.
      - Fix sectors_to_do bitmap issue, by Logan Gunthorpe.

 - Work on unifying the null_blk module parameters and configfs API
   (Vincent)

 - drbd bitmap IO error fix (Lars)

 - Set of rnbd fixes (Guoqing, Md Haris)

 - Remove experimental marker on bcache async device registration (Coly)

 - Series cleaning up the bio splitting (Christoph)

 - Removal of the sx8 block driver. This hardware was never really
   widespread, and it didn't receive a lot of attention after the
   initial merge of it back in 2005 (Christoph)

 - A few fixes for s390 dasd (Eric, Jiang)

 - Followup set of fixes for ublk (Ming)

 - Support for UBLK_IO_NEED_GET_DATA for ublk (ZiyangZhang)

 - Fixes for the dio dma alignment (Keith)

 - Misc fixes and cleanups (Ming, Yu, Dan, Christophe)

* tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block: (136 commits)
  s390/dasd: Establish DMA alignment
  s390/dasd: drop unexpected word 'for' in comments
  ublk_drv: add support for UBLK_IO_NEED_GET_DATA
  ublk_cmd.h: add one new ublk command: UBLK_IO_NEED_GET_DATA
  ublk_drv: cleanup ublksrv_ctrl_dev_info
  ublk_drv: add SET_PARAMS/GET_PARAMS control command
  ublk_drv: fix ublk device leak in case that add_disk fails
  ublk_drv: cancel device even though disk isn't up
  block: fix leaking page ref on truncated direct io
  block: ensure bio_iov_add_page can't fail
  block: ensure iov_iter advances for added pages
  drivers:md:fix a potential use-after-free bug
  md/raid5: Ensure batch_last is released before sleeping for quiesce
  md/raid5: Move stripe_request_ctx up
  md/raid5: Drop unnecessary call to r5c_check_stripe_cache_usage()
  md/raid5: Make is_inactive_blocked() helper
  md/raid5: Refactor raid5_get_active_stripe()
  block: pass struct queue_limits to the bio splitting helpers
  block: move bio_allowed_max_sectors to blk-merge.c
  block: move the call to get_max_io_size out of blk_bio_segment_split
  ...
This commit is contained in:
Linus Torvalds 2022-08-04 20:00:14 -07:00
commit fa9db655d0
86 changed files with 5659 additions and 2875 deletions

View File

@ -72,6 +72,28 @@ submit_queues=[1..nr_cpus]: Default: 1
hw_queue_depth=[0..qdepth]: Default: 64
The hardware queue depth of the device.
memory_backed=[0/1]: Default: 0
Whether or not to use a memory buffer to respond to IO requests
= =============================================
0 Transfer no data in response to IO requests
1 Use a memory buffer to respond to IO requests
= =============================================
discard=[0/1]: Default: 0
Support discard operations (requires memory-backed null_blk device).
= =====================================
0 Do not support discard operations
1 Enable support for discard operations
= =====================================
cache_size=[Size in MB]: Default: 0
Cache size in MB for memory-backed device.
mbps=[Maximum bandwidth in MB/s]: Default: 0 (no limit)
Bandwidth limit for device performance.
Multi-queue specific parameters
-------------------------------

View File

@ -14507,7 +14507,8 @@ S: Supported
W: http://git.infradead.org/nvme.git
T: git://git.infradead.org/nvme.git
F: drivers/nvme/host/
F: include/linux/nvme.h
F: drivers/nvme/common/
F: include/linux/nvme*
F: include/uapi/linux/nvme_ioctl.h
NVM EXPRESS FC TRANSPORT DRIVERS
@ -18838,6 +18839,7 @@ SOFTWARE RAID (Multiple Disks) SUPPORT
M: Song Liu <song@kernel.org>
L: linux-raid@vger.kernel.org
S: Supported
Q: https://patchwork.kernel.org/project/linux-raid/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
F: drivers/md/Kconfig
F: drivers/md/Makefile

View File

@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt &&
bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits,
&bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0;

View File

@ -965,7 +965,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
* would create a gap, disallow it.
*/
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (bvec_gap_to_prev(q, bvec, offset))
if (bvec_gap_to_prev(&q->limits, bvec, offset))
return 0;
}
@ -1151,22 +1151,12 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
static void bio_put_pages(struct page **pages, size_t size, size_t off)
{
size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
for (i = 0; i < nr; i++)
put_page(pages[i]);
}
static int bio_iov_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
bool same_page = false;
if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
if (WARN_ON_ONCE(bio_full(bio, len)))
return -EINVAL;
__bio_add_page(bio, page, len, offset);
return 0;
}
@ -1209,8 +1199,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
ssize_t size, left;
unsigned len, i;
unsigned len, i = 0;
size_t offset;
int ret = 0;
/*
* Move page array up in the allocated memory for the bio vecs as far as
@ -1227,32 +1218,40 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
* result to ensure the bio's total size is correct. The remainder of
* the iov data will be picked up in the next bio iteration.
*/
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
if (size > 0)
size = iov_iter_get_pages(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
nr_pages, &offset);
if (size > 0) {
nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
if (unlikely(size <= 0))
return size ? size : -EFAULT;
} else
nr_pages = 0;
if (unlikely(size <= 0)) {
ret = size ? size : -EFAULT;
goto out;
}
for (left = size, i = 0; left > 0; left -= len, i++) {
struct page *page = pages[i];
int ret;
len = min_t(size_t, PAGE_SIZE - offset, left);
if (bio_op(bio) == REQ_OP_ZONE_APPEND)
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
ret = bio_iov_add_zone_append_page(bio, page, len,
offset);
else
ret = bio_iov_add_page(bio, page, len, offset);
if (ret)
break;
} else
bio_iov_add_page(bio, page, len, offset);
if (ret) {
bio_put_pages(pages + i, left, offset);
return ret;
}
offset = 0;
}
iov_iter_advance(iter, size);
return 0;
iov_iter_advance(iter, size - left);
out:
while (i < nr_pages)
put_page(pages[i++]);
return ret;
}
/**

View File

@ -377,7 +377,6 @@ static void blk_timeout_work(struct work_struct *work)
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
{
struct request_queue *q;
int ret;
q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
GFP_KERNEL | __GFP_ZERO, node_id);
@ -396,13 +395,9 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
if (q->id < 0)
goto fail_srcu;
ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
if (ret)
goto fail_id;
q->stats = blk_alloc_queue_stats();
if (!q->stats)
goto fail_split;
goto fail_id;
q->node = node_id;
@ -439,8 +434,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
fail_stats:
blk_free_queue_stats(q->stats);
fail_split:
bioset_exit(&q->bio_split);
fail_id:
ida_free(&blk_queue_ida, q->id);
fail_srcu:

View File

@ -82,7 +82,7 @@ static inline bool bio_will_gap(struct request_queue *q,
bio_get_first_bvec(next, &nb);
if (biovec_phys_mergeable(q, &pb, &nb))
return false;
return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
}
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
@ -95,23 +95,30 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
return bio_will_gap(req->q, NULL, bio, req->bio);
}
static struct bio *blk_bio_discard_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
unsigned *nsegs)
/*
* The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
* is defined as 'unsigned int'; in addition it has to be aligned to the
* logical block size, which is the minimum accepted unit by hardware.
*/
static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
{
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
unsigned *nsegs, struct bio_set *bs)
{
unsigned int max_discard_sectors, granularity;
int alignment;
sector_t tmp;
unsigned split_sectors;
*nsegs = 1;
/* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(q->limits.discard_granularity >> 9, 1U);
granularity = max(lim->discard_granularity >> 9, 1U);
max_discard_sectors = min(q->limits.max_discard_sectors,
bio_allowed_max_sectors(q));
max_discard_sectors =
min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors)) {
@ -128,9 +135,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
* If the next starting sector would be misaligned, stop the discard at
* the previous aligned sector.
*/
alignment = (q->limits.discard_alignment >> 9) % granularity;
tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
tmp = bio->bi_iter.bi_sector + split_sectors -
((lim->discard_alignment >> 9) % granularity);
tmp = sector_div(tmp, granularity);
if (split_sectors > tmp)
@ -139,18 +145,15 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
return bio_split(bio, split_sectors, GFP_NOIO, bs);
}
static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
struct bio *bio, struct bio_set *bs, unsigned *nsegs)
static struct bio *bio_split_write_zeroes(struct bio *bio,
struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
{
*nsegs = 0;
if (!q->limits.max_write_zeroes_sectors)
if (!lim->max_write_zeroes_sectors)
return NULL;
if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
return NULL;
return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
}
/*
@ -161,17 +164,17 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
* requests that are submitted to a block device if the start of a bio is not
* aligned to a physical block boundary.
*/
static inline unsigned get_max_io_size(struct request_queue *q,
struct bio *bio)
static inline unsigned get_max_io_size(struct bio *bio,
struct queue_limits *lim)
{
unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
unsigned max_sectors = queue_max_sectors(q), start, end;
unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
unsigned max_sectors = lim->max_sectors, start, end;
if (q->limits.chunk_sectors) {
if (lim->chunk_sectors) {
max_sectors = min(max_sectors,
blk_chunk_sectors_left(bio->bi_iter.bi_sector,
q->limits.chunk_sectors));
lim->chunk_sectors));
}
start = bio->bi_iter.bi_sector & (pbs - 1);
@ -181,11 +184,10 @@ static inline unsigned get_max_io_size(struct request_queue *q,
return max_sectors & ~(lbs - 1);
}
static inline unsigned get_max_segment_size(const struct request_queue *q,
struct page *start_page,
unsigned long offset)
static inline unsigned get_max_segment_size(struct queue_limits *lim,
struct page *start_page, unsigned long offset)
{
unsigned long mask = queue_segment_boundary(q);
unsigned long mask = lim->seg_boundary_mask;
offset = mask & (page_to_phys(start_page) + offset);
@ -194,12 +196,12 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
* on 32bit arch, use queue's max segment size when that happens.
*/
return min_not_zero(mask - offset + 1,
(unsigned long)queue_max_segment_size(q));
(unsigned long)lim->max_segment_size);
}
/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
* @q: [in] request queue associated with the bio associated with @bv
* @lim: [in] queue limits to split based on
* @bv: [in] bvec to examine
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
* by the number of segments from @bv that may be appended to that
@ -217,10 +219,9 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
* the block driver.
*/
static bool bvec_split_segs(const struct request_queue *q,
const struct bio_vec *bv, unsigned *nsegs,
unsigned *bytes, unsigned max_segs,
unsigned max_bytes)
static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
unsigned *nsegs, unsigned *bytes, unsigned max_segs,
unsigned max_bytes)
{
unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
@ -228,7 +229,7 @@ static bool bvec_split_segs(const struct request_queue *q,
unsigned seg_size = 0;
while (len && *nsegs < max_segs) {
seg_size = get_max_segment_size(q, bv->bv_page,
seg_size = get_max_segment_size(lim, bv->bv_page,
bv->bv_offset + total_len);
seg_size = min(seg_size, len);
@ -236,7 +237,7 @@ static bool bvec_split_segs(const struct request_queue *q,
total_len += seg_size;
len -= seg_size;
if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
break;
}
@ -247,16 +248,17 @@ static bool bvec_split_segs(const struct request_queue *q,
}
/**
* blk_bio_segment_split - split a bio in two bios
* @q: [in] request queue pointer
* bio_split_rw - split a bio in two bios
* @bio: [in] bio to be split
* @bs: [in] bio set to allocate the clone from
* @lim: [in] queue limits to split based on
* @segs: [out] number of segments in the bio with the first half of the sectors
* @bs: [in] bio set to allocate the clone from
* @max_bytes: [in] maximum number of bytes per bio
*
* Clone @bio, update the bi_iter of the clone to represent the first sectors
* of @bio and update @bio->bi_iter to represent the remaining sectors. The
* following is guaranteed for the cloned bio:
* - That it has at most get_max_io_size(@q, @bio) sectors.
* - That it has at most @max_bytes worth of data
* - That it has at most queue_max_segments(@q) segments.
*
* Except for discard requests the cloned bio will point at the bi_io_vec of
@ -265,33 +267,30 @@ static bool bvec_split_segs(const struct request_queue *q,
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
unsigned *segs)
static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
unsigned nsegs = 0, bytes = 0;
const unsigned max_bytes = get_max_io_size(q, bio) << 9;
const unsigned max_segs = queue_max_segments(q);
bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
goto split;
if (nsegs < max_segs &&
if (nsegs < lim->max_segments &&
bytes + bv.bv_len <= max_bytes &&
bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
bytes += bv.bv_len;
} else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs,
max_bytes)) {
goto split;
} else {
if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
lim->max_segments, max_bytes))
goto split;
}
bvprv = bv;
@ -308,7 +307,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* split size so that each bio is properly block size aligned, even if
* we do not use the full hardware limits.
*/
bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q));
bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
/*
* Bio splitting may cause subtle trouble such as hang when doing sync
@ -320,34 +319,35 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
}
/**
* __blk_queue_split - split a bio and submit the second half
* @q: [in] request_queue new bio is being queued at
* @bio: [in, out] bio to be split
* @nr_segs: [out] number of segments in the first bio
* __bio_split_to_limits - split a bio to fit the queue limits
* @bio: bio to be split
* @lim: queue limits to split based on
* @nr_segs: returns the number of segments in the returned bio
*
* Split a bio into two bios, chain the two bios, submit the second half and
* store a pointer to the first half in *@bio. If the second bio is still too
* big it will be split by a recursive call to this function. Since this
* function may allocate a new bio from q->bio_split, it is the responsibility
* of the caller to ensure that q->bio_split is only released after processing
* of the split bio has finished.
* Check if @bio needs splitting based on the queue limits, and if so split off
* a bio fitting the limits from the beginning of @bio and return it. @bio is
* shortened to the remainder and re-submitted.
*
* The split bio is allocated from the bio_split bio_set of the gendisk that
* @bio->bi_bdev belongs to, which is provided by the block layer.
*/
void __blk_queue_split(struct request_queue *q, struct bio **bio,
struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
unsigned int *nr_segs)
{
struct bio *split = NULL;
struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
struct bio *split;
switch (bio_op(*bio)) {
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
split = bio_split_discard(bio, lim, nr_segs, bs);
break;
case REQ_OP_WRITE_ZEROES:
split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
nr_segs);
split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
break;
default:
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
split = bio_split_rw(bio, lim, nr_segs, bs,
get_max_io_size(bio, lim) << SECTOR_SHIFT);
break;
}
@ -356,32 +356,35 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
split->bi_opf |= REQ_NOMERGE;
blkcg_bio_issue_init(split);
bio_chain(split, *bio);
trace_block_split(split, (*bio)->bi_iter.bi_sector);
submit_bio_noacct(*bio);
*bio = split;
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
submit_bio_noacct(bio);
return split;
}
return bio;
}
/**
* blk_queue_split - split a bio and submit the second half
* @bio: [in, out] bio to be split
* bio_split_to_limits - split a bio to fit the queue limits
* @bio: bio to be split
*
* Split a bio into two bios, chains the two bios, submit the second half and
* store a pointer to the first half in *@bio. Since this function may allocate
* a new bio from q->bio_split, it is the responsibility of the caller to ensure
* that q->bio_split is only released after processing of the split bio has
* finished.
* Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
* if so split off a bio fitting the limits from the beginning of @bio and
* return it. @bio is shortened to the remainder and re-submitted.
*
* The split bio is allocated from the bio_split bio_set of the gendisk that
* @bio->bi_bdev belongs to, which is provided by the block layer.
*/
void blk_queue_split(struct bio **bio)
struct bio *bio_split_to_limits(struct bio *bio)
{
struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
if (blk_may_split(q, *bio))
__blk_queue_split(q, bio, &nr_segs);
if (bio_may_exceed_limits(bio, lim))
return __bio_split_to_limits(bio, lim, &nr_segs);
return bio;
}
EXPORT_SYMBOL(blk_queue_split);
EXPORT_SYMBOL(bio_split_to_limits);
unsigned int blk_recalc_rq_segments(struct request *rq)
{
@ -411,7 +414,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
}
rq_for_each_bvec(bv, rq, iter)
bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes,
bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
@ -442,8 +445,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q,
while (nbytes > 0) {
unsigned offset = bvec->bv_offset + total;
unsigned len = min(get_max_segment_size(q, bvec->bv_page,
offset), nbytes);
unsigned len = min(get_max_segment_size(&q->limits,
bvec->bv_page, offset), nbytes);
struct page *page = bvec->bv_page;
/*

View File

@ -2815,9 +2815,9 @@ void blk_mq_submit_bio(struct bio *bio)
unsigned int nr_segs = 1;
blk_status_t ret;
blk_queue_bounce(q, &bio);
if (blk_may_split(q, bio))
__blk_queue_split(q, &bio, &nr_segs);
bio = blk_queue_bounce(bio, q);
if (bio_may_exceed_limits(bio, &q->limits))
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio_integrity_prep(bio))
return;

View File

@ -779,8 +779,6 @@ static void blk_release_queue(struct kobject *kobj)
if (queue_is_mq(q))
blk_mq_release(q);
bioset_exit(&q->bio_split);
if (blk_queue_has_srcu(q))
cleanup_srcu_struct(q->srcu);

View File

@ -97,23 +97,23 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
return true;
}
static inline bool __bvec_gap_to_prev(struct request_queue *q,
static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
return (offset & queue_virt_boundary(q)) ||
((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
return (offset & lim->virt_boundary_mask) ||
((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask);
}
/*
* Check if adding a bio_vec after bprv with offset would create a gap in
* the SG list. Most drivers don't care about this, but some do.
*/
static inline bool bvec_gap_to_prev(struct request_queue *q,
static inline bool bvec_gap_to_prev(struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
if (!queue_virt_boundary(q))
if (!lim->virt_boundary_mask)
return false;
return __bvec_gap_to_prev(q, bprv, offset);
return __bvec_gap_to_prev(lim, bprv, offset);
}
static inline bool rq_mergeable(struct request *rq)
@ -189,7 +189,8 @@ static inline bool integrity_req_gap_back_merge(struct request *req,
struct bio_integrity_payload *bip = bio_integrity(req->bio);
struct bio_integrity_payload *bip_next = bio_integrity(next);
return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
return bvec_gap_to_prev(&req->q->limits,
&bip->bip_vec[bip->bip_vcnt - 1],
bip_next->bip_vec[0].bv_offset);
}
@ -199,7 +200,8 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
return bvec_gap_to_prev(&req->q->limits,
&bip->bip_vec[bip->bip_vcnt - 1],
bip_next->bip_vec[0].bv_offset);
}
@ -288,7 +290,8 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
static inline bool bio_may_exceed_limits(struct bio *bio,
struct queue_limits *lim)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
@ -307,12 +310,12 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
* to the performance impact of cloned bios themselves the loop below
* doesn't matter anyway.
*/
return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
return lim->chunk_sectors || bio->bi_vcnt != 1 ||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
void __blk_queue_split(struct request_queue *q, struct bio **bio,
unsigned int *nr_segs);
struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@ -344,16 +347,6 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
q->last_merge = NULL;
}
/*
* The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
* is defined as 'unsigned int'; in addition it has to be aligned to the
* logical block size, which is the minimum accepted unit by hardware.
*/
static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
{
return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9;
}
/*
* Internal io_context interface
*/
@ -378,7 +371,7 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { }
static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
#endif
void __blk_queue_bounce(struct request_queue *q, struct bio **bio);
struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
static inline bool blk_queue_may_bounce(struct request_queue *q)
{
@ -387,10 +380,12 @@ static inline bool blk_queue_may_bounce(struct request_queue *q)
max_low_pfn >= max_pfn;
}
static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
static inline struct bio *blk_queue_bounce(struct bio *bio,
struct request_queue *q)
{
if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio)))
__blk_queue_bounce(q, bio);
if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
return __blk_queue_bounce(bio, q);
return bio;
}
#ifdef CONFIG_BLK_CGROUP_IOLATENCY

View File

@ -199,24 +199,24 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
return NULL;
}
void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
{
struct bio *bio;
int rw = bio_data_dir(*bio_orig);
int rw = bio_data_dir(bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
unsigned i = 0, bytes = 0;
bool bounce = false;
int sectors;
bio_for_each_segment(from, *bio_orig, iter) {
bio_for_each_segment(from, bio_orig, iter) {
if (i++ < BIO_MAX_VECS)
bytes += from.bv_len;
if (PageHighMem(from.bv_page))
bounce = true;
}
if (!bounce)
return;
return bio_orig;
/*
* Individual bvecs might not be logical block aligned. Round down
@ -225,13 +225,13 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
*/
sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
SECTOR_SHIFT;
if (sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
bio_chain(bio, *bio_orig);
submit_bio_noacct(*bio_orig);
*bio_orig = bio;
if (sectors < bio_sectors(bio_orig)) {
bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
bio_chain(bio, bio_orig);
submit_bio_noacct(bio_orig);
bio_orig = bio;
}
bio = bounce_clone_bio(*bio_orig);
bio = bounce_clone_bio(bio_orig);
/*
* Bvec table can't be updated by bio_for_each_segment_all(),
@ -254,7 +254,7 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
to->bv_page = bounce_page;
}
trace_block_bio_bounce(*bio_orig);
trace_block_bio_bounce(bio_orig);
bio->bi_flags |= (1 << BIO_BOUNCED);
@ -263,6 +263,6 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
else
bio->bi_end_io = bounce_end_io_write;
bio->bi_private = *bio_orig;
*bio_orig = bio;
bio->bi_private = bio_orig;
return bio;
}

View File

@ -1151,6 +1151,7 @@ static void disk_release(struct device *dev)
blk_mq_exit_queue(disk->queue);
blkcg_exit_queue(disk->queue);
bioset_exit(&disk->bio_split);
disk_release_events(disk);
kfree(disk->random);
@ -1342,9 +1343,12 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (!disk)
goto out_put_queue;
if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
goto out_free_disk;
disk->bdi = bdi_alloc(node_id);
if (!disk->bdi)
goto out_free_disk;
goto out_free_bioset;
/* bdev_alloc() might need the queue, set before the first call */
disk->queue = q;
@ -1382,6 +1386,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
iput(disk->part0->bd_inode);
out_free_bdi:
bdi_put(disk->bdi);
out_free_bioset:
bioset_exit(&disk->bio_split);
out_free_disk:
kfree(disk);
out_put_queue:

View File

@ -104,6 +104,12 @@ int crypto_grab_kpp(struct crypto_kpp_spawn *spawn,
}
EXPORT_SYMBOL_GPL(crypto_grab_kpp);
int crypto_has_kpp(const char *alg_name, u32 type, u32 mask)
{
return crypto_type_has_alg(alg_name, &crypto_kpp_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_has_kpp);
static void kpp_prepare_alg(struct kpp_alg *alg)
{
struct crypto_alg *base = &alg->base;

View File

@ -521,6 +521,12 @@ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
}
EXPORT_SYMBOL_GPL(crypto_alloc_shash);
int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
{
return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_has_shash);
static int shash_prepare_alg(struct shash_alg *alg)
{
struct crypto_alg *base = &alg->base;

View File

@ -248,15 +248,6 @@ config BLK_DEV_NBD
If unsure, say N.
config BLK_DEV_SX8
tristate "Promise SATA SX8 support"
depends on PCI
help
Saying Y or M here will enable support for the
Promise SATA SX8 controllers.
Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
config BLK_DEV_RAM
tristate "RAM block device support"
help

View File

@ -26,8 +26,6 @@ obj-$(CONFIG_SUNVDC) += sunvdc.o
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/

View File

@ -974,25 +974,58 @@ static void drbd_bm_endio(struct bio *bio)
}
}
/* For the layout, see comment above drbd_md_set_sector_offsets(). */
static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev)
{
	/*
	 * Which end marker applies depends on the meta data index:
	 * the two "internal" flavours use al_offset, everything else
	 * (DRBD_MD_INDEX_FLEX_EXT and any other index) uses md_size_sect.
	 */
	if (bdev->md.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
	    bdev->md.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
		return bdev->md.md_offset + bdev->md.al_offset -1;

	return bdev->md.md_offset + bdev->md.md_size_sect -1;
}
static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
{
struct drbd_device *device = ctx->device;
enum req_op op = ctx->flags & BM_AIO_READ ? REQ_OP_READ : REQ_OP_WRITE;
struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op,
GFP_NOIO, &drbd_md_io_bio_set);
struct drbd_bitmap *b = device->bitmap;
struct bio *bio;
struct page *page;
sector_t last_bm_sect;
sector_t first_bm_sect;
sector_t on_disk_sector;
unsigned int len;
sector_t on_disk_sector =
device->ldev->md.md_offset + device->ldev->md.bm_offset;
on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
first_bm_sect = device->ldev->md.md_offset + device->ldev->md.bm_offset;
on_disk_sector = first_bm_sect + (((sector_t)page_nr) << (PAGE_SHIFT-SECTOR_SHIFT));
/* this might happen with very small
* flexible external meta data device,
* or with PAGE_SIZE > 4k */
len = min_t(unsigned int, PAGE_SIZE,
(drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
last_bm_sect = drbd_md_last_bitmap_sector(device->ldev);
if (first_bm_sect <= on_disk_sector && last_bm_sect >= on_disk_sector) {
sector_t len_sect = last_bm_sect - on_disk_sector + 1;
if (len_sect < PAGE_SIZE/SECTOR_SIZE)
len = (unsigned int)len_sect*SECTOR_SIZE;
else
len = PAGE_SIZE;
} else {
if (__ratelimit(&drbd_ratelimit_state)) {
drbd_err(device, "Invalid offset during on-disk bitmap access: "
"page idx %u, sector %llu\n", page_nr, on_disk_sector);
}
ctx->error = -EIO;
bm_set_page_io_err(b->bm_pages[page_nr]);
if (atomic_dec_and_test(&ctx->in_flight)) {
ctx->done = 1;
wake_up(&device->misc_wait);
kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
}
return;
}
/* serialize IO on this page */
bm_page_lock_io(device, page_nr);
@ -1007,6 +1040,8 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
bm_store_page_idx(page, page_nr);
} else
page = b->bm_pages[page_nr];
bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO,
&drbd_md_io_bio_set);
bio->bi_iter.bi_sector = on_disk_sector;
/* bio_add_page of a single page to an empty bio will always succeed,
* according to api. Do we want to assert that? */

View File

@ -1608,7 +1608,7 @@ void drbd_submit_bio(struct bio *bio)
{
struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
blk_queue_split(&bio);
bio = bio_split_to_limits(bio);
/*
* what we "blindly" assume:

View File

@ -11,6 +11,8 @@
* (part of code stolen from loop.c)
*/
#define pr_fmt(fmt) "nbd: " fmt
#include <linux/major.h>
#include <linux/blkdev.h>
@ -1950,7 +1952,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
!refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
pr_err("nbd: device at index %d is going down\n",
pr_err("device at index %d is going down\n",
index);
return -EINVAL;
}
@ -1961,7 +1963,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
if (!nbd) {
nbd = nbd_dev_add(index, 2);
if (IS_ERR(nbd)) {
pr_err("nbd: failed to add new device\n");
pr_err("failed to add new device\n");
return PTR_ERR(nbd);
}
}

View File

@ -201,6 +201,22 @@ static bool g_use_per_node_hctx;
module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
static bool g_memory_backed;
module_param_named(memory_backed, g_memory_backed, bool, 0444);
MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false");
static bool g_discard;
module_param_named(discard, g_discard, bool, 0444);
MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false");
/*
 * Module-wide default for the cache size; copied into each new device's
 * cache_size field when the device is allocated.
 */
static unsigned long g_cache_size;
module_param_named(cache_size, g_cache_size, ulong, 0444);
/* The description must be keyed on the parameter's own name, cache_size. */
MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)");
static unsigned int g_mbps;
module_param_named(mbps, g_mbps, uint, 0444);
MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
static bool g_zoned;
module_param_named(zoned, g_zoned, bool, S_IRUGO);
MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
@ -409,6 +425,8 @@ NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
@ -532,6 +550,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_zone_max_open,
&nullb_device_attr_zone_max_active,
&nullb_device_attr_virt_boundary,
&nullb_device_attr_no_sched,
&nullb_device_attr_shared_tag_bitmap,
NULL,
};
@ -588,7 +608,13 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
return snprintf(page, PAGE_SIZE,
"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors,virt_boundary\n");
"badblocks,blocking,blocksize,cache_size,"
"completion_nsec,discard,home_node,hw_queue_depth,"
"irqmode,max_sectors,mbps,memory_backed,no_sched,"
"poll_queues,power,queue_mode,shared_tag_bitmap,size,"
"submit_queues,use_per_node_hctx,virt_boundary,zoned,"
"zone_capacity,zone_max_active,zone_max_open,"
"zone_nr_conv,zone_size\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
@ -650,6 +676,10 @@ static struct nullb_device *null_alloc_dev(void)
dev->irqmode = g_irqmode;
dev->hw_queue_depth = g_hw_queue_depth;
dev->blocking = g_blocking;
dev->memory_backed = g_memory_backed;
dev->discard = g_discard;
dev->cache_size = g_cache_size;
dev->mbps = g_mbps;
dev->use_per_node_hctx = g_use_per_node_hctx;
dev->zoned = g_zoned;
dev->zone_size = g_zone_size;
@ -658,6 +688,8 @@ static struct nullb_device *null_alloc_dev(void)
dev->zone_max_open = g_zone_max_open;
dev->zone_max_active = g_zone_max_active;
dev->virt_boundary = g_virt_boundary;
dev->no_sched = g_no_sched;
dev->shared_tag_bitmap = g_shared_tag_bitmap;
return dev;
}
@ -1655,7 +1687,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
static void cleanup_queue(struct nullb_queue *nq)
{
kfree(nq->tag_map);
bitmap_free(nq->tag_map);
kfree(nq->cmds);
}
@ -1782,14 +1814,13 @@ static const struct block_device_operations null_rq_ops = {
static int setup_commands(struct nullb_queue *nq)
{
struct nullb_cmd *cmd;
int i, tag_size;
int i;
nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
if (!nq->cmds)
return -ENOMEM;
tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL);
nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
if (!nq->tag_map) {
kfree(nq->cmds);
return -ENOMEM;
@ -1866,31 +1897,48 @@ static int null_gendisk_register(struct nullb *nullb)
static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
{
unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
int hw_queues, numa_node;
unsigned int queue_depth;
int poll_queues;
set->ops = &null_mq_ops;
set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
g_submit_queues;
poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues;
if (poll_queues)
set->nr_hw_queues += poll_queues;
set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
g_hw_queue_depth;
set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
set->cmd_size = sizeof(struct nullb_cmd);
set->flags = BLK_MQ_F_SHOULD_MERGE;
if (g_no_sched)
set->flags |= BLK_MQ_F_NO_SCHED;
if (g_shared_tag_bitmap)
set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
set->driver_data = nullb;
if (poll_queues)
set->nr_maps = 3;
else
set->nr_maps = 1;
if (nullb) {
hw_queues = nullb->dev->submit_queues;
poll_queues = nullb->dev->poll_queues;
queue_depth = nullb->dev->hw_queue_depth;
numa_node = nullb->dev->home_node;
if (nullb->dev->no_sched)
flags |= BLK_MQ_F_NO_SCHED;
if (nullb->dev->shared_tag_bitmap)
flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (nullb->dev->blocking)
flags |= BLK_MQ_F_BLOCKING;
} else {
hw_queues = g_submit_queues;
poll_queues = g_poll_queues;
queue_depth = g_hw_queue_depth;
numa_node = g_home_node;
if (g_no_sched)
flags |= BLK_MQ_F_NO_SCHED;
if (g_shared_tag_bitmap)
flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (g_blocking)
flags |= BLK_MQ_F_BLOCKING;
}
if ((nullb && nullb->dev->blocking) || g_blocking)
set->flags |= BLK_MQ_F_BLOCKING;
set->ops = &null_mq_ops;
set->cmd_size = sizeof(struct nullb_cmd);
set->flags = flags;
set->driver_data = nullb;
set->nr_hw_queues = hw_queues;
set->queue_depth = queue_depth;
set->numa_node = numa_node;
if (poll_queues) {
set->nr_hw_queues += poll_queues;
set->nr_maps = 3;
} else {
set->nr_maps = 1;
}
return blk_mq_alloc_tag_set(set);
}
@ -2042,8 +2090,13 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
mutex_lock(&lock);
nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
dev->index = nullb->index;
rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
if (rv < 0) {
mutex_unlock(&lock);
goto out_cleanup_zone;
}
nullb->index = rv;
dev->index = rv;
mutex_unlock(&lock);
blk_queue_logical_block_size(nullb->q, dev->blocksize);
@ -2069,7 +2122,7 @@ static int null_add_dev(struct nullb_device *dev)
rv = null_gendisk_register(nullb);
if (rv)
goto out_cleanup_zone;
goto out_ida_free;
mutex_lock(&lock);
list_add_tail(&nullb->list, &nullb_list);
@ -2078,6 +2131,9 @@ static int null_add_dev(struct nullb_device *dev)
pr_info("disk %s created\n", nullb->disk_name);
return 0;
out_ida_free:
ida_free(&nullb_indexes, nullb->index);
out_cleanup_zone:
null_free_zoned_dev(dev);
out_cleanup_disk:

View File

@ -113,6 +113,8 @@ struct nullb_device {
bool discard; /* if support discard */
bool zoned; /* if device is zoned */
bool virt_boundary; /* virtual boundary on/off for the device */
bool no_sched; /* no IO scheduler for the device */
bool shared_tag_bitmap; /* use hostwide shared tags */
};
struct nullb {

View File

@ -2399,7 +2399,7 @@ static void pkt_submit_bio(struct bio *bio)
struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
struct bio *split;
blk_queue_split(&bio);
bio = bio_split_to_limits(bio);
pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
(unsigned long long)bio->bi_iter.bi_sector,

View File

@ -586,7 +586,7 @@ static void ps3vram_submit_bio(struct bio *bio)
dev_dbg(&dev->core, "%s\n", __func__);
blk_queue_split(&bio);
bio = bio_split_to_limits(bio);
spin_lock_irq(&priv->lock);
busy = !bio_list_empty(&priv->list);

View File

@ -376,7 +376,7 @@ static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj,
if (ret)
return ret;
ret = rnbd_clt_resize_disk(dev, (size_t)sectors);
ret = rnbd_clt_resize_disk(dev, sectors);
if (ret)
return ret;

View File

@ -68,39 +68,18 @@ static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
return refcount_inc_not_zero(&dev->refcount);
}
static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
const struct rnbd_msg_open_rsp *rsp)
static void rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
sector_t new_nsectors)
{
struct rnbd_clt_session *sess = dev->sess;
if (get_capacity(dev->gd) == new_nsectors)
return;
if (!rsp->logical_block_size)
return -EINVAL;
dev->device_id = le32_to_cpu(rsp->device_id);
dev->nsectors = le64_to_cpu(rsp->nsectors);
dev->logical_block_size = le16_to_cpu(rsp->logical_block_size);
dev->physical_block_size = le16_to_cpu(rsp->physical_block_size);
dev->max_discard_sectors = le32_to_cpu(rsp->max_discard_sectors);
dev->discard_granularity = le32_to_cpu(rsp->discard_granularity);
dev->discard_alignment = le32_to_cpu(rsp->discard_alignment);
dev->secure_discard = le16_to_cpu(rsp->secure_discard);
dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK);
dev->fua = !!(rsp->cache_policy & RNBD_FUA);
dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
dev->max_segments = sess->max_segments;
return 0;
}
static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
size_t new_nsectors)
{
rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n",
dev->nsectors, new_nsectors);
dev->nsectors = new_nsectors;
set_capacity_and_notify(dev->gd, dev->nsectors);
return 0;
/*
* If the size changed, we need to revalidate it
*/
rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n",
get_capacity(dev->gd), new_nsectors);
set_capacity_and_notify(dev->gd, new_nsectors);
}
static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
@ -119,19 +98,16 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
u64 nsectors = le64_to_cpu(rsp->nsectors);
/*
* If the device was remapped and the size changed in the
* meantime we need to revalidate it
*/
if (dev->nsectors != nsectors)
rnbd_clt_change_capacity(dev, nsectors);
rnbd_clt_change_capacity(dev, nsectors);
gd_kobj = &disk_to_dev(dev->gd)->kobj;
kobject_uevent(gd_kobj, KOBJ_ONLINE);
rnbd_clt_info(dev, "Device online, device remapped successfully\n");
}
err = rnbd_clt_set_dev_attr(dev, rsp);
if (err)
if (!rsp->logical_block_size) {
err = -EINVAL;
goto out;
}
dev->device_id = le32_to_cpu(rsp->device_id);
dev->dev_state = DEV_STATE_MAPPED;
out:
@ -140,7 +116,7 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
return err;
}
int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize)
{
int ret = 0;
@ -150,7 +126,7 @@ int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
ret = -ENOENT;
goto out;
}
ret = rnbd_clt_change_capacity(dev, newsize);
rnbd_clt_change_capacity(dev, newsize);
out:
mutex_unlock(&dev->lock);
@ -507,6 +483,11 @@ static void msg_open_conf(struct work_struct *work)
struct rnbd_msg_open_rsp *rsp = iu->buf;
struct rnbd_clt_dev *dev = iu->dev;
int errno = iu->errno;
bool from_map = false;
/* INIT state is only triggered from rnbd_clt_map_device */
if (dev->dev_state == DEV_STATE_INIT)
from_map = true;
if (errno) {
rnbd_clt_err(dev,
@ -523,7 +504,9 @@ static void msg_open_conf(struct work_struct *work)
send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT);
}
}
kfree(rsp);
/* We free rsp in rnbd_clt_map_device for map scenario */
if (!from_map)
kfree(rsp);
wake_up_iu_comp(iu, errno);
rnbd_put_iu(dev->sess, iu);
rnbd_clt_put_dev(dev);
@ -942,7 +925,7 @@ static int rnbd_client_open(struct block_device *block_device, fmode_t mode)
{
struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
if (dev->read_only && (mode & FMODE_WRITE))
if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE))
return -EPERM;
if (dev->dev_state == DEV_STATE_UNMAPPED ||
@ -963,10 +946,10 @@ static int rnbd_client_getgeo(struct block_device *block_device,
struct hd_geometry *geo)
{
u64 size;
struct rnbd_clt_dev *dev;
struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
struct queue_limits *limit = &dev->queue->limits;
dev = block_device->bd_disk->private_data;
size = dev->size * (dev->logical_block_size / SECTOR_SIZE);
size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
geo->cylinders = size >> 6; /* size/64 */
geo->heads = 4;
geo->sectors = 16;
@ -1350,11 +1333,15 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
}
}
static void setup_request_queue(struct rnbd_clt_dev *dev)
static void setup_request_queue(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp)
{
blk_queue_logical_block_size(dev->queue, dev->logical_block_size);
blk_queue_physical_block_size(dev->queue, dev->physical_block_size);
blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors);
blk_queue_logical_block_size(dev->queue,
le16_to_cpu(rsp->logical_block_size));
blk_queue_physical_block_size(dev->queue,
le16_to_cpu(rsp->physical_block_size));
blk_queue_max_hw_sectors(dev->queue,
dev->sess->max_io_size / SECTOR_SIZE);
/*
* we don't support discards to "discontiguous" segments
@ -1362,21 +1349,27 @@ static void setup_request_queue(struct rnbd_clt_dev *dev)
*/
blk_queue_max_discard_segments(dev->queue, 1);
blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors);
dev->queue->limits.discard_granularity = dev->discard_granularity;
dev->queue->limits.discard_alignment = dev->discard_alignment;
if (dev->secure_discard)
blk_queue_max_discard_sectors(dev->queue,
le32_to_cpu(rsp->max_discard_sectors));
dev->queue->limits.discard_granularity =
le32_to_cpu(rsp->discard_granularity);
dev->queue->limits.discard_alignment =
le32_to_cpu(rsp->discard_alignment);
if (le16_to_cpu(rsp->secure_discard))
blk_queue_max_secure_erase_sectors(dev->queue,
dev->max_discard_sectors);
le32_to_cpu(rsp->max_discard_sectors));
blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
blk_queue_max_segments(dev->queue, dev->max_segments);
blk_queue_max_segments(dev->queue, dev->sess->max_segments);
blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
blk_queue_write_cache(dev->queue, dev->wc, dev->fua);
blk_queue_write_cache(dev->queue,
!!(rsp->cache_policy & RNBD_WRITEBACK),
!!(rsp->cache_policy & RNBD_FUA));
}
static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp, int idx)
{
int err;
@ -1388,19 +1381,15 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
dev->gd->private_data = dev;
snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d",
idx);
pr_debug("disk_name=%s, capacity=%zu\n",
pr_debug("disk_name=%s, capacity=%llu\n",
dev->gd->disk_name,
dev->nsectors * (dev->logical_block_size / SECTOR_SIZE)
);
le64_to_cpu(rsp->nsectors) *
(le16_to_cpu(rsp->logical_block_size) / SECTOR_SIZE));
set_capacity(dev->gd, dev->nsectors);
set_capacity(dev->gd, le64_to_cpu(rsp->nsectors));
if (dev->access_mode == RNBD_ACCESS_RO) {
dev->read_only = true;
if (dev->access_mode == RNBD_ACCESS_RO)
set_disk_ro(dev->gd, true);
} else {
dev->read_only = false;
}
/*
* Network device does not need rotational
@ -1413,11 +1402,13 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
return err;
}
static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp)
{
int idx = dev->clt_device_id;
dev->size = dev->nsectors * dev->logical_block_size;
dev->size = le64_to_cpu(rsp->nsectors) *
le16_to_cpu(rsp->logical_block_size);
dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev);
if (IS_ERR(dev->gd))
@ -1425,8 +1416,8 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev)
dev->queue = dev->gd->queue;
rnbd_init_mq_hw_queues(dev);
setup_request_queue(dev);
return rnbd_clt_setup_gen_disk(dev, idx);
setup_request_queue(dev, rsp);
return rnbd_clt_setup_gen_disk(dev, rsp, idx);
}
static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
@ -1562,7 +1553,14 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
{
struct rnbd_clt_session *sess;
struct rnbd_clt_dev *dev;
int ret;
int ret, errno;
struct rnbd_msg_open_rsp *rsp;
struct rnbd_msg_open msg;
struct rnbd_iu *iu;
struct kvec vec = {
.iov_base = &msg,
.iov_len = sizeof(msg)
};
if (exists_devpath(pathname, sessname))
return ERR_PTR(-EEXIST);
@ -1582,17 +1580,47 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
ret = -EEXIST;
goto put_dev;
}
ret = send_msg_open(dev, RTRS_PERMIT_WAIT);
rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
if (!rsp) {
ret = -ENOMEM;
goto del_dev;
}
iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
if (!iu) {
ret = -ENOMEM;
kfree(rsp);
goto del_dev;
}
iu->buf = rsp;
iu->dev = dev;
sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN);
msg.access_mode = dev->access_mode;
strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));
WARN_ON(!rnbd_clt_get_dev(dev));
ret = send_usr_msg(sess->rtrs, READ, iu,
&vec, sizeof(*rsp), iu->sgt.sgl, 1,
msg_open_conf, &errno, RTRS_PERMIT_WAIT);
if (ret) {
rnbd_clt_put_dev(dev);
rnbd_put_iu(sess, iu);
} else {
ret = errno;
}
if (ret) {
rnbd_clt_err(dev,
"map_device: failed, can't open remote device, err: %d\n",
ret);
goto del_dev;
goto put_iu;
}
mutex_lock(&dev->lock);
pr_debug("Opened remote device: session=%s, path='%s'\n",
sess->sessname, pathname);
ret = rnbd_client_setup_device(dev);
ret = rnbd_client_setup_device(dev, rsp);
if (ret) {
rnbd_clt_err(dev,
"map_device: Failed to configure device, err: %d\n",
@ -1602,21 +1630,30 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
}
rnbd_clt_info(dev,
"map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
dev->gd->disk_name, dev->nsectors,
dev->logical_block_size, dev->physical_block_size,
dev->max_discard_sectors,
dev->discard_granularity, dev->discard_alignment,
dev->secure_discard, dev->max_segments,
dev->max_hw_sectors, dev->wc, dev->fua);
"map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
dev->gd->disk_name, le64_to_cpu(rsp->nsectors),
le16_to_cpu(rsp->logical_block_size),
le16_to_cpu(rsp->physical_block_size),
le32_to_cpu(rsp->max_discard_sectors),
le32_to_cpu(rsp->discard_granularity),
le32_to_cpu(rsp->discard_alignment),
le16_to_cpu(rsp->secure_discard),
sess->max_segments, sess->max_io_size / SECTOR_SIZE,
!!(rsp->cache_policy & RNBD_WRITEBACK),
!!(rsp->cache_policy & RNBD_FUA));
mutex_unlock(&dev->lock);
kfree(rsp);
rnbd_put_iu(sess, iu);
rnbd_clt_put_sess(sess);
return dev;
send_close:
send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
put_iu:
kfree(rsp);
rnbd_put_iu(sess, iu);
del_dev:
delete_dev(dev);
put_dev:

View File

@ -106,6 +106,7 @@ struct rnbd_queue {
};
struct rnbd_clt_dev {
struct kobject kobj;
struct rnbd_clt_session *sess;
struct request_queue *queue;
struct rnbd_queue *hw_queues;
@ -114,27 +115,14 @@ struct rnbd_clt_dev {
u32 clt_device_id;
struct mutex lock;
enum rnbd_clt_dev_state dev_state;
refcount_t refcount;
char *pathname;
enum rnbd_access_mode access_mode;
u32 nr_poll_queues;
bool read_only;
bool wc;
bool fua;
u32 max_hw_sectors;
u32 max_discard_sectors;
u32 discard_granularity;
u32 discard_alignment;
u16 secure_discard;
u16 physical_block_size;
u16 logical_block_size;
u16 max_segments;
size_t nsectors;
u64 size; /* device size in bytes */
struct list_head list;
struct gendisk *gd;
struct kobject kobj;
char *blk_symlink_name;
refcount_t refcount;
struct work_struct unmap_on_rmmod_work;
};
@ -150,7 +138,7 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
const struct attribute *sysfs_self);
int rnbd_clt_remap_device(struct rnbd_clt_dev *dev);
int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize);
int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize);
/* rnbd-clt-sysfs.c */

View File

@ -224,7 +224,6 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
wait_for_completion(&dc); /* wait for inflights to drop to zero */
rnbd_dev_close(sess_dev->rnbd_dev);
list_del(&sess_dev->sess_list);
mutex_lock(&sess_dev->dev->lock);
list_del(&sess_dev->dev_list);
if (sess_dev->open_flags & FMODE_WRITE)
@ -239,14 +238,14 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
static void destroy_sess(struct rnbd_srv_session *srv_sess)
{
struct rnbd_srv_sess_dev *sess_dev, *tmp;
struct rnbd_srv_sess_dev *sess_dev;
unsigned long index;
if (list_empty(&srv_sess->sess_dev_list))
if (xa_empty(&srv_sess->index_idr))
goto out;
mutex_lock(&srv_sess->lock);
list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list,
sess_list)
xa_for_each(&srv_sess->index_idr, index, sess_dev)
rnbd_srv_destroy_dev_session_sysfs(sess_dev);
mutex_unlock(&srv_sess->lock);
@ -281,7 +280,6 @@ static int create_sess(struct rtrs_srv_sess *rtrs)
srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs);
xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC);
INIT_LIST_HEAD(&srv_sess->sess_dev_list);
mutex_init(&srv_sess->lock);
mutex_lock(&sess_lock);
list_add(&srv_sess->list, &sess_list);
@ -323,10 +321,11 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev,
{
struct rnbd_srv_session *sess = sess_dev->sess;
sess_dev->keep_id = true;
/* It is already started to close by client's close message. */
if (!mutex_trylock(&sess->lock))
return;
sess_dev->keep_id = true;
/* first remove sysfs itself to avoid deadlock */
sysfs_remove_file_self(&sess_dev->kobj, &attr->attr);
rnbd_srv_destroy_dev_session_sysfs(sess_dev);
@ -666,11 +665,12 @@ static struct rnbd_srv_sess_dev *
find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name)
{
struct rnbd_srv_sess_dev *sess_dev;
unsigned long index;
if (list_empty(&srv_sess->sess_dev_list))
if (xa_empty(&srv_sess->index_idr))
return NULL;
list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list)
xa_for_each(&srv_sess->index_idr, index, sess_dev)
if (!strcmp(sess_dev->pathname, dev_name))
return sess_dev;
@ -780,8 +780,6 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list);
mutex_unlock(&srv_dev->lock);
list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list);
rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id);
kfree(full_path);

View File

@ -25,8 +25,6 @@ struct rnbd_srv_session {
int queue_depth;
struct xarray index_idr;
/* List of struct rnbd_srv_sess_dev */
struct list_head sess_dev_list;
struct mutex lock;
u8 ver;
};
@ -48,8 +46,6 @@ struct rnbd_srv_dev {
struct rnbd_srv_sess_dev {
/* Entry inside rnbd_srv_dev struct */
struct list_head dev_list;
/* Entry inside rnbd_srv_session struct */
struct list_head sess_list;
struct rnbd_dev *rnbd_dev;
struct rnbd_srv_session *sess;
struct rnbd_srv_dev *dev;

File diff suppressed because it is too large Load Diff

View File

@ -47,7 +47,12 @@
#define UBLK_MINORS (1U << MINORBITS)
/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK)
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
| UBLK_F_URING_CMD_COMP_IN_TASK \
| UBLK_F_NEED_GET_DATA)
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
struct ublk_rq_data {
struct callback_head work;
@ -86,6 +91,15 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_ABORTED 0x04
/*
* UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command requires
* getting the data buffer address from ublksrv.
*
* Then, bio data could be copied into this data buffer for a WRITE request
* after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
*/
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
struct ublk_io {
/* userspace buffer address from io cmd */
__u64 addr;
@ -119,7 +133,6 @@ struct ublk_device {
char *__queues;
unsigned short queue_size;
unsigned short bs_shift;
struct ublksrv_ctrl_dev_info dev_info;
struct blk_mq_tag_set tag_set;
@ -137,6 +150,8 @@ struct ublk_device {
spinlock_t mm_lock;
struct mm_struct *mm;
struct ublk_params params;
struct completion completion;
unsigned int nr_queues_ready;
atomic_t nr_aborted_queues;
@ -149,6 +164,12 @@ struct ublk_device {
struct work_struct stop_work;
};
/* header of ublk_params */
struct ublk_params_header {
/* NOTE(review): presumably the byte length of the parameter blob - confirm against struct ublk_params */
__u32 len;
/* NOTE(review): presumably a bitmask of UBLK_PARAM_TYPE_* values - confirm against struct ublk_params */
__u32 types;
};
static dev_t ublk_chr_devt;
static struct class *ublk_chr_class;
@ -160,6 +181,90 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
static struct miscdevice ublk_misc;
/*
 * Push the basic parameter set stored in ub->params.basic into the request
 * queue and gendisk behind ub->ub_disk.
 *
 * Block sizes and I/O hints are stored as shifts and expanded with
 * "1 << shift" before being handed to the block layer setters.
 */
static void ublk_dev_param_basic_apply(struct ublk_device *ub)
{
	const struct ublk_param_basic *basic = &ub->params.basic;
	struct request_queue *queue = ub->ub_disk->queue;

	/* block sizes and I/O size hints */
	blk_queue_logical_block_size(queue, 1 << basic->logical_bs_shift);
	blk_queue_physical_block_size(queue, 1 << basic->physical_bs_shift);
	blk_queue_io_min(queue, 1 << basic->io_min_shift);
	blk_queue_io_opt(queue, 1 << basic->io_opt_shift);

	/* write cache / FUA are taken straight from the attribute bits */
	blk_queue_write_cache(queue,
			      basic->attrs & UBLK_ATTR_VOLATILE_CACHE,
			      basic->attrs & UBLK_ATTR_FUA);

	/* NONROT is the inverse of the ROTATIONAL attribute */
	if (!(basic->attrs & UBLK_ATTR_ROTATIONAL))
		blk_queue_flag_set(QUEUE_FLAG_NONROT, queue);
	else
		blk_queue_flag_clear(QUEUE_FLAG_NONROT, queue);

	/* request size and boundary limits */
	blk_queue_max_hw_sectors(queue, basic->max_sectors);
	blk_queue_chunk_sectors(queue, basic->chunk_sectors);
	blk_queue_virt_boundary(queue, basic->virt_boundary_mask);

	/* the disk is only ever switched to read-only here, never back */
	if (basic->attrs & UBLK_ATTR_READ_ONLY)
		set_disk_ro(ub->ub_disk, true);

	set_capacity(ub->ub_disk, basic->dev_sectors);
}
/*
 * Push the discard parameter set stored in ub->params.discard into the
 * request queue of ub->ub_disk: alignment and granularity are written to
 * the queue limits directly, the remaining values go through the block
 * layer setters.
 */
static void ublk_dev_param_discard_apply(struct ublk_device *ub)
{
	const struct ublk_param_discard *discard = &ub->params.discard;
	struct request_queue *queue = ub->ub_disk->queue;

	queue->limits.discard_alignment = discard->discard_alignment;
	queue->limits.discard_granularity = discard->discard_granularity;

	blk_queue_max_discard_sectors(queue, discard->max_discard_sectors);
	blk_queue_max_write_zeroes_sectors(queue,
					   discard->max_write_zeroes_sectors);
	blk_queue_max_discard_segments(queue, discard->max_discard_segments);
}
/*
 * Sanity-check the parameters stored in ub->params before they are applied.
 *
 * The basic parameter set is mandatory; the discard set is only checked
 * when its type bit is present.
 *
 * Returns 0 when every present parameter set is acceptable, -EINVAL
 * otherwise.
 */
static int ublk_validate_params(const struct ublk_device *ub)
{
	/* basic param is the only one which must be set */
	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
		const struct ublk_param_basic *p = &ub->params.basic;

		/*
		 * The logical block size must lie between one 512-byte
		 * sector (shift 9, the same sector shift used for the
		 * max_sectors check below) and one page.
		 */
		if (p->logical_bs_shift > PAGE_SHIFT ||
		    p->logical_bs_shift < 9)
			return -EINVAL;

		/* the physical block can not be smaller than the logical one */
		if (p->logical_bs_shift > p->physical_bs_shift)
			return -EINVAL;

		/* a single request must fit into the per-io buffer */
		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
			return -EINVAL;
	} else
		return -EINVAL;

	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
		const struct ublk_param_discard *p = &ub->params.discard;

		/* So far, only support single segment discard */
		if (p->max_discard_sectors && p->max_discard_segments != 1)
			return -EINVAL;

		if (!p->discard_granularity)
			return -EINVAL;
	}

	return 0;
}
/*
 * Apply the parameter sets recorded in ub->params to the device.
 *
 * The basic set is required and always applied first; the discard set is
 * applied afterwards when its type bit is present.
 *
 * Returns 0 on success, -EINVAL when the basic set is missing.
 */
static int ublk_apply_params(struct ublk_device *ub)
{
	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
		ublk_dev_param_basic_apply(ub);

		if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
			ublk_dev_param_discard_apply(ub);

		return 0;
	}

	return -EINVAL;
}
static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
{
if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
@ -168,6 +273,13 @@ static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
return false;
}
/* True when the queue's flags carry UBLK_F_NEED_GET_DATA. */
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
{
	return (ubq->flags & UBLK_F_NEED_GET_DATA) != 0;
}
static struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
@ -509,6 +621,21 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
}
}
/*
 * Hand one io command over to ublksrv: update the slot's ownership flags
 * and complete the pending uring command with @res.
 */
static void ubq_complete_io_cmd(struct ublk_io *io, int res)
{
	/*
	 * Mark this cmd owned by ublksrv and, in the same update, drop
	 * ACTIVE: we are done with this sqe/cmd slot, and a new io cmd can
	 * only be accepted while the slot is not active.
	 */
	io->flags = (io->flags | UBLK_IO_FLAG_OWNED_BY_SRV) &
		    ~UBLK_IO_FLAG_ACTIVE;

	/* tell ublksrv one io request is coming */
	io_uring_cmd_done(io->cmd, res, 0);
}
#define UBLK_REQUEUE_DELAY_MS 3
static inline void __ublk_rq_task_work(struct request *req)
@ -531,6 +658,30 @@ static inline void __ublk_rq_task_work(struct request *req)
return;
}
if (ublk_need_get_data(ubq) &&
(req_op(req) == REQ_OP_WRITE ||
req_op(req) == REQ_OP_FLUSH)) {
/*
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
* so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
* and notify it.
*/
if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
__func__, io->cmd->cmd_op, ubq->q_id,
req->tag, io->flags);
ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA);
return;
}
/*
* We have handled UBLK_IO_NEED_GET_DATA command,
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
* do the copy work.
*/
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
}
mapped_bytes = ublk_map_io(ubq, req, io);
/* partially mapped, update io descriptor */
@ -553,17 +704,7 @@ static inline void __ublk_rq_task_work(struct request *req)
mapped_bytes >> 9;
}
/* mark this cmd owned by ublksrv */
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
/*
* clear ACTIVE since we are done with this sqe/cmd slot
* We can only accept io cmd in case of being not active.
*/
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
/* tell ublksrv one io request is coming */
io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
}
static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
@ -788,16 +929,27 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
UBLK_DAEMON_MONITOR_PERIOD);
}
static inline bool ublk_queue_ready(struct ublk_queue *ubq)
{
return ubq->nr_io_ready == ubq->q_depth;
}
/*
 * Abort every io command of @ubq that is still flagged ACTIVE.
 *
 * Does nothing unless the queue is ready (see ublk_queue_ready()).
 * Afterwards nr_io_ready is reset to 0.
 */
static void ublk_cancel_queue(struct ublk_queue *ubq)
{
	int tag;

	if (!ublk_queue_ready(ubq))
		return;

	for (tag = 0; tag < ubq->q_depth; tag++) {
		struct ublk_io *slot = &ubq->ios[tag];

		/* only slots still flagged ACTIVE are completed here */
		if (!(slot->flags & UBLK_IO_FLAG_ACTIVE))
			continue;

		io_uring_cmd_done(slot->cmd, UBLK_IO_RES_ABORT, 0);
	}

	/* all io commands are canceled */
	ubq->nr_io_ready = 0;
}
/* Cancel all pending commands, must be called after del_gendisk() returns */
@ -818,19 +970,14 @@ static void ublk_stop_dev(struct ublk_device *ub)
del_gendisk(ub->ub_disk);
ub->dev_info.state = UBLK_S_DEV_DEAD;
ub->dev_info.ublksrv_pid = -1;
ublk_cancel_dev(ub);
put_disk(ub->ub_disk);
ub->ub_disk = NULL;
unlock:
ublk_cancel_dev(ub);
mutex_unlock(&ub->mutex);
cancel_delayed_work_sync(&ub->monitor_work);
}
static inline bool ublk_queue_ready(struct ublk_queue *ubq)
{
return ubq->nr_io_ready == ubq->q_depth;
}
/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
{
@ -846,6 +993,25 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
mutex_unlock(&ub->mutex);
}
/*
 * Handle a UBLK_IO_NEED_GET_DATA command for request (@q_id, @tag).
 *
 * The request is looked up from the tag set and handed to the request
 * task-work path: through task_work_add() on the queue's daemon task when
 * ublk_can_use_task_work() allows it, otherwise through
 * io_uring_cmd_complete_in_task() with the request stashed in the
 * command's pdu.
 */
static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
		int tag, struct io_uring_cmd *cmd)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
	struct ublk_uring_cmd_pdu *pdu;
	struct ublk_rq_data *data;

	if (!ublk_can_use_task_work(ubq)) {
		pdu = ublk_get_uring_cmd_pdu(cmd);
		pdu->req = req;
		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
		return;
	}

	data = blk_mq_rq_to_pdu(req);

	/* should not fail since we call it just in ubq->ubq_daemon */
	task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI);
}
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
@ -884,6 +1050,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
goto out;
}
/*
* ensure that the user issues UBLK_IO_NEED_GET_DATA
* iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
*/
if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
^ (cmd_op == UBLK_IO_NEED_GET_DATA))
goto out;
switch (cmd_op) {
case UBLK_IO_FETCH_REQ:
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
@ -917,6 +1091,14 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
io->cmd = cmd;
ublk_commit_completion(ub, ub_cmd);
break;
case UBLK_IO_NEED_GET_DATA:
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
io->addr = ub_cmd->addr;
io->cmd = cmd;
io->flags |= UBLK_IO_FLAG_ACTIVE;
ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag, cmd);
break;
default:
goto out;
}
@ -1083,13 +1265,13 @@ static void ublk_stop_work_fn(struct work_struct *work)
ublk_stop_dev(ub);
}
/* align maximum I/O size to PAGE_SIZE */
/* align max io buffer size with PAGE_SIZE */
static void ublk_align_max_io_size(struct ublk_device *ub)
{
unsigned int max_rq_bytes = ub->dev_info.rq_max_blocks << ub->bs_shift;
unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
ub->dev_info.rq_max_blocks =
round_down(max_rq_bytes, PAGE_SIZE) >> ub->bs_shift;
ub->dev_info.max_io_buf_bytes =
round_down(max_io_bytes, PAGE_SIZE);
}
static int ublk_add_tag_set(struct ublk_device *ub)
@ -1132,7 +1314,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
{
struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
int ublksrv_pid = (int)header->data[0];
unsigned long dev_blocks = header->data[1];
struct ublk_device *ub;
struct gendisk *disk;
int ret = -EINVAL;
@ -1155,10 +1336,6 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
goto out_unlock;
}
/* We may get disk size updated */
if (dev_blocks)
ub->dev_info.dev_blocks = dev_blocks;
disk = blk_mq_alloc_disk(&ub->tag_set, ub);
if (IS_ERR(disk)) {
ret = PTR_ERR(disk);
@ -1168,27 +1345,28 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd)
disk->fops = &ub_fops;
disk->private_data = ub;
blk_queue_logical_block_size(disk->queue, ub->dev_info.block_size);
blk_queue_physical_block_size(disk->queue, ub->dev_info.block_size);
blk_queue_io_min(disk->queue, ub->dev_info.block_size);
blk_queue_max_hw_sectors(disk->queue,
ub->dev_info.rq_max_blocks << (ub->bs_shift - 9));
disk->queue->limits.discard_granularity = PAGE_SIZE;
blk_queue_max_discard_sectors(disk->queue, UINT_MAX >> 9);
blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX >> 9);
set_capacity(disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9));
ub->dev_info.ublksrv_pid = ublksrv_pid;
ub->ub_disk = disk;
ret = ublk_apply_params(ub);
if (ret)
goto out_put_disk;
get_device(&ub->cdev_dev);
ret = add_disk(disk);
if (ret) {
put_disk(disk);
goto out_unlock;
/*
* Has to drop the reference since ->free_disk won't be
* called in case of add_disk failure.
*/
ublk_put_device(ub);
goto out_put_disk;
}
set_bit(UB_STATE_USED, &ub->state);
ub->dev_info.state = UBLK_S_DEV_LIVE;
out_put_disk:
if (ret)
put_disk(disk);
out_unlock:
mutex_unlock(&ub->mutex);
ublk_put_device(ub);
@ -1250,9 +1428,8 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
pr_devel("%s: dev id %d flags %llx\n", __func__,
info->dev_id, info->flags);
pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
info->nr_hw_queues, info->queue_depth,
info->block_size, info->dev_blocks);
pr_devel("\t nr_hw_queues %d queue_depth %d\n",
info->nr_hw_queues, info->queue_depth);
}
static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
@ -1312,7 +1489,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
ub->bs_shift = ilog2(ub->dev_info.block_size);
ub->dev_info.nr_hw_queues = min_t(unsigned int,
ub->dev_info.nr_hw_queues, nr_cpu_ids);
ublk_align_max_io_size(ub);
@ -1436,6 +1612,82 @@ static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd)
return ret;
}
/*
 * UBLK_CMD_GET_PARAMS handler: copy the device's stored parameters to the
 * user buffer described by the control command.
 *
 * The user buffer must start with a struct ublk_params_header whose len
 * says how many bytes the caller wants. That length is capped at
 * sizeof(struct ublk_params).
 *
 * Returns 0 on success, -EINVAL for a bad buffer/length or unknown device
 * id, -EFAULT if a user copy fails.
 */
static int ublk_ctrl_get_params(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_params_header ph;
	struct ublk_device *ub;
	int ret;

	/* need a user address and room for more than just the header */
	if (!header->addr || header->len <= sizeof(ph))
		return -EINVAL;

	if (copy_from_user(&ph, argp, sizeof(ph)))
		return -EFAULT;

	/* requested length must be non-zero and fit in the user buffer */
	if (!ph.len || ph.len > header->len)
		return -EINVAL;

	/* never copy out more than the kernel's own structure */
	if (ph.len > sizeof(struct ublk_params))
		ph.len = sizeof(struct ublk_params);

	ub = ublk_get_device_from_id(header->dev_id);
	if (!ub)
		return -EINVAL;

	mutex_lock(&ub->mutex);
	ret = copy_to_user(argp, &ub->params, ph.len) ? -EFAULT : 0;
	mutex_unlock(&ub->mutex);

	ublk_put_device(ub);
	return ret;
}
/*
 * UBLK_CMD_SET_PARAMS handler: copy a parameter block from user space into
 * ub->params and validate it.
 *
 * The user buffer must start with a struct ublk_params_header with a
 * non-zero len and types. The copied length is capped at
 * sizeof(struct ublk_params). Parameters cannot be changed while the
 * device is live.
 *
 * Returns 0 on success, -EINVAL for a bad buffer/header, unknown device id
 * or parameters rejected by ublk_validate_params(), -EACCES if the device
 * is live, -EFAULT if a user copy fails.
 */
static int ublk_ctrl_set_params(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_params_header ph;
	struct ublk_device *ub;
	int ret;

	if (header->len <= sizeof(ph) || !header->addr)
		return -EINVAL;

	if (copy_from_user(&ph, argp, sizeof(ph)))
		return -EFAULT;

	if (ph.len > header->len || !ph.len || !ph.types)
		return -EINVAL;

	if (ph.len > sizeof(struct ublk_params))
		ph.len = sizeof(struct ublk_params);

	ub = ublk_get_device_from_id(header->dev_id);
	if (!ub)
		return -EINVAL;

	/* parameters can only be changed when device isn't live */
	mutex_lock(&ub->mutex);
	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
		ret = -EACCES;
	} else if (copy_from_user(&ub->params, argp, ph.len)) {
		ret = -EFAULT;
		/* ub->params may be partially overwritten: invalidate it */
		ub->params.types = 0;
	} else {
		/* clear all we don't support yet */
		ub->params.types &= UBLK_PARAM_TYPE_ALL;
		ret = ublk_validate_params(ub);
		/*
		 * Do not keep rejected parameters around:
		 * ublk_apply_params() only checks that the BASIC type bit
		 * is set, so leaving the bits in place would let a later
		 * start apply values that failed validation.
		 */
		if (ret)
			ub->params.types = 0;
	}
	mutex_unlock(&ub->mutex);
	ublk_put_device(ub);

	return ret;
}
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
@ -1471,6 +1723,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_GET_QUEUE_AFFINITY:
ret = ublk_ctrl_get_queue_affinity(cmd);
break;
case UBLK_CMD_GET_PARAMS:
ret = ublk_ctrl_get_params(cmd);
break;
case UBLK_CMD_SET_PARAMS:
ret = ublk_ctrl_set_params(cmd);
break;
default:
break;
}

View File

@ -29,7 +29,7 @@ config BCACHE_CLOSURES_DEBUG
operations that get stuck.
config BCACHE_ASYNC_REGISTRATION
bool "Asynchronous device registration (EXPERIMENTAL)"
bool "Asynchronous device registration"
depends on BCACHE
help
Add a sysfs file /sys/fs/bcache/register_async. Writing registering

View File

@ -3728,6 +3728,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)

View File

@ -1016,7 +1016,7 @@ static void dm_wq_requeue_work(struct work_struct *work)
while (io) {
struct dm_io *next = io->next;
dm_io_rewind(io, &md->queue->bio_split);
dm_io_rewind(io, &md->disk->bio_split);
io->next = NULL;
__dm_io_complete(io, false);
@ -1181,7 +1181,7 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector)
* Does the target need to split IO even further?
* - varied (per target) IO splitting is a tenet of DM; this
* explains why stacked chunk_sectors based splitting via
* blk_queue_split() isn't possible here.
* bio_split_to_limits() isn't possible here.
*/
if (!ti->max_io_len)
return len;
@ -1751,10 +1751,10 @@ static void dm_split_and_process_bio(struct mapped_device *md,
is_abnormal = is_abnormal_io(bio);
if (unlikely(is_abnormal)) {
/*
* Use blk_queue_split() for abnormal IO (e.g. discard, etc)
* Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
* otherwise associated queue_limits won't be imposed.
*/
blk_queue_split(&bio);
bio = bio_split_to_limits(bio);
}
init_clone_info(&ci, md, map, bio, is_abnormal);

View File

@ -125,7 +125,6 @@ static void __init md_setup_drive(struct md_setup_args *args)
char *devname = args->device_names;
dev_t devices[MD_SB_DISKS + 1], mdev;
struct mdu_array_info_s ainfo = { };
struct block_device *bdev;
struct mddev *mddev;
int err = 0, i;
char name[16];
@ -169,24 +168,16 @@ static void __init md_setup_drive(struct md_setup_args *args)
pr_info("md: Loading %s: %s\n", name, args->device_names);
bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL);
if (IS_ERR(bdev)) {
pr_err("md: open failed - cannot start array %s\n", name);
mddev = md_alloc(mdev, name);
if (IS_ERR(mddev)) {
pr_err("md: md_alloc failed - cannot start array %s\n", name);
return;
}
err = -EIO;
if (WARN(bdev->bd_disk->fops != &md_fops,
"Opening block device %x resulted in non-md device\n",
mdev))
goto out_blkdev_put;
mddev = bdev->bd_disk->private_data;
err = mddev_lock(mddev);
if (err) {
pr_err("md: failed to lock array %s\n", name);
goto out_blkdev_put;
goto out_mddev_put;
}
if (!list_empty(&mddev->disks) || mddev->raid_disks) {
@ -230,8 +221,8 @@ static void __init md_setup_drive(struct md_setup_args *args)
pr_warn("md: starting %s failed\n", name);
out_unlock:
mddev_unlock(mddev);
out_blkdev_put:
blkdev_put(bdev, FMODE_READ);
out_mddev_put:
mddev_put(mddev);
}
static int __init raid_setup(char *str)

View File

@ -40,7 +40,7 @@ struct resync_info {
/* Lock the send communication. This is done through
* bit manipulation as opposed to a mutex in order to
* accomodate lock and hold. See next comment.
* accommodate lock and hold. See next comment.
*/
#define MD_CLUSTER_SEND_LOCK 4
/* If cluster operations (such as adding a disk) must lock the
@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
/*
* If resync thread run after raid1d thread, then process_metadata_update
* could not continue if raid1d held reconfig_mutex (and raid1d is blocked
* since another node already got EX on Token and waitting the EX of Ack),
* since another node already got EX on Token and waiting the EX of Ack),
* so let resync wake up thread in case flag is set.
*/
if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,

View File

@ -368,28 +368,6 @@ EXPORT_SYMBOL_GPL(md_new_event);
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
/*
* iterates through all used mddevs in the system.
* We take care to grab the all_mddevs_lock whenever navigating
* the list, and to always hold a refcount when unlocked.
* Any code which breaks out of this loop still owns
* a reference to the current mddev and must mddev_put it.
*/
#define for_each_mddev(_mddev,_tmp) \
\
for (({ spin_lock(&all_mddevs_lock); \
_tmp = all_mddevs.next; \
_mddev = NULL;}); \
({ if (_tmp != &all_mddevs) \
mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
spin_unlock(&all_mddevs_lock); \
if (_mddev) mddev_put(_mddev); \
_mddev = list_entry(_tmp, struct mddev, all_mddevs); \
_tmp != &all_mddevs;}); \
({ spin_lock(&all_mddevs_lock); \
_tmp = _tmp->next;}) \
)
/* Rather than calling directly into the personality make_request function,
* IO requests come here first so that we can check if the device is
* being suspended pending a reconfiguration.
@ -464,7 +442,7 @@ static void md_submit_bio(struct bio *bio)
return;
}
blk_queue_split(&bio);
bio = bio_split_to_limits(bio);
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
@ -647,13 +625,17 @@ EXPORT_SYMBOL(md_flush_request);
/*
 * Take a reference on @mddev by incrementing ->active, unless the array
 * has already been flagged MD_DELETED.
 *
 * The caller must hold all_mddevs_lock (asserted via lockdep).
 *
 * Returns @mddev when the reference was taken, NULL otherwise.
 */
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	lockdep_assert_held(&all_mddevs_lock);

	if (!test_bit(MD_DELETED, &mddev->flags)) {
		atomic_inc(&mddev->active);
		return mddev;
	}

	return NULL;
}
static void mddev_delayed_delete(struct work_struct *ws);
static void mddev_put(struct mddev *mddev)
void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
@ -661,7 +643,7 @@ static void mddev_put(struct mddev *mddev)
mddev->ctime == 0 && !mddev->hold_active) {
/* Array is not configured at all, and not held active,
* so destroy it */
list_del_init(&mddev->all_mddevs);
set_bit(MD_DELETED, &mddev->flags);
/*
* Call queue_work inside the spinlock so that
@ -678,7 +660,6 @@ static void md_safemode_timeout(struct timer_list *t);
void mddev_init(struct mddev *mddev)
{
kobject_init(&mddev->kobj, &md_ktype);
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->bitmap_info.mutex);
@ -733,22 +714,6 @@ static dev_t mddev_alloc_unit(void)
return dev;
}
/*
 * Look up the mddev for @unit and take a reference on it.
 *
 * For units outside MD_MAJOR the low MdpMinorShift bits of the device
 * number are masked off before the lookup. The lookup and the
 * mddev_get() call both happen under all_mddevs_lock.
 *
 * Returns the mddev found by mddev_find_locked(), or NULL if there is none.
 */
static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *found;

	if (MAJOR(unit) != MD_MAJOR)
		unit &= ~((1 << MdpMinorShift) - 1);

	spin_lock(&all_mddevs_lock);
	found = mddev_find_locked(unit);
	if (found != NULL)
		mddev_get(found);
	spin_unlock(&all_mddevs_lock);

	return found;
}
static struct mddev *mddev_alloc(dev_t unit)
{
struct mddev *new;
@ -791,6 +756,15 @@ static struct mddev *mddev_alloc(dev_t unit)
return ERR_PTR(error);
}
/*
 * Unlink @mddev from the global all_mddevs list (under all_mddevs_lock)
 * and free its memory.
 */
static void mddev_free(struct mddev *mddev)
{
	struct list_head *node = &mddev->all_mddevs;

	spin_lock(&all_mddevs_lock);
	list_del(node);
	spin_unlock(&all_mddevs_lock);

	kfree(mddev);
}
static const struct attribute_group md_redundancy_group;
void mddev_unlock(struct mddev *mddev)
@ -3335,14 +3309,35 @@ rdev_size_show(struct md_rdev *rdev, char *page)
return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
}
static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
{
/* check if two start/length pairs overlap */
if (s1+l1 <= s2)
return 0;
if (s2+l2 <= s1)
return 0;
return 1;
if (a->data_offset + a->sectors <= b->data_offset)
return false;
if (b->data_offset + b->sectors <= a->data_offset)
return false;
return true;
}
/*
 * Check whether @rdev overlaps any other rdev that sits on the same
 * block device, across every array on all_mddevs that is not flagged
 * MD_DELETED. The walk is done under all_mddevs_lock.
 *
 * Returns true as soon as one overlapping rdev is found, false otherwise.
 */
static bool md_rdev_overlaps(struct md_rdev *rdev)
{
	struct mddev *mddev;
	struct md_rdev *other;
	bool found = false;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
		if (test_bit(MD_DELETED, &mddev->flags))
			continue;

		rdev_for_each(other, mddev) {
			/* skip ourselves and rdevs on a different bdev */
			if (other == rdev || other->bdev != rdev->bdev)
				continue;

			if (md_rdevs_overlap(rdev, other)) {
				found = true;
				goto out_unlock;
			}
		}
	}
out_unlock:
	spin_unlock(&all_mddevs_lock);
	return found;
}
static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
@ -3394,46 +3389,21 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
return -EINVAL; /* component must fit device */
rdev->sectors = sectors;
if (sectors > oldsectors && my_mddev->external) {
/* Need to check that all other rdevs with the same
* ->bdev do not overlap. 'rcu' is sufficient to walk
* the rdev lists safely.
* This check does not provide a hard guarantee, it
* just helps avoid dangerous mistakes.
/*
* Check that all other rdevs with the same bdev do not overlap. This
* check does not provide a hard guarantee, it just helps avoid
* dangerous mistakes.
*/
if (sectors > oldsectors && my_mddev->external &&
md_rdev_overlaps(rdev)) {
/*
* Someone else could have slipped in a size change here, but
* doing so is just silly. We put oldsectors back because we
* know it is safe, and trust userspace not to race with itself.
*/
struct mddev *mddev;
int overlap = 0;
struct list_head *tmp;
rcu_read_lock();
for_each_mddev(mddev, tmp) {
struct md_rdev *rdev2;
rdev_for_each(rdev2, mddev)
if (rdev->bdev == rdev2->bdev &&
rdev != rdev2 &&
overlaps(rdev->data_offset, rdev->sectors,
rdev2->data_offset,
rdev2->sectors)) {
overlap = 1;
break;
}
if (overlap) {
mddev_put(mddev);
break;
}
}
rcu_read_unlock();
if (overlap) {
/* Someone else could have slipped in a size
* change here, but doing so is just silly.
* We put oldsectors back because we *know* it is
* safe, and trust userspace not to race with
* itself
*/
rdev->sectors = oldsectors;
return -EBUSY;
}
rdev->sectors = oldsectors;
return -EBUSY;
}
return len;
}
@ -4830,6 +4800,19 @@ action_store(struct mddev *mddev, const char *page, size_t len)
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
sector_t save_rp = mddev->reshape_position;
mddev_unlock(mddev);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
mddev_lock_nointr(mddev);
/*
* set RECOVERY_INTR again and restore reshape
* position in case others changed them after
* got lock, eg, reshape_position_store and
* md_check_recovery.
*/
mddev->reshape_position = save_rp;
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
}
@ -5001,7 +4984,7 @@ static ssize_t
sync_speed_show(struct mddev *mddev, char *page)
{
unsigned long resync, dt, db;
if (mddev->curr_resync == 0)
if (mddev->curr_resync == MD_RESYNC_NONE)
return sprintf(page, "none\n");
resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
dt = (jiffies - mddev->resync_mark) / HZ;
@ -5020,8 +5003,8 @@ sync_completed_show(struct mddev *mddev, char *page)
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
if (mddev->curr_resync == 1 ||
mddev->curr_resync == 2)
if (mddev->curr_resync == MD_RESYNC_YIELDED ||
mddev->curr_resync == MD_RESYNC_DELAYED)
return sprintf(page, "delayed\n");
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@ -5532,11 +5515,10 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
if (!entry->show)
return -EIO;
spin_lock(&all_mddevs_lock);
if (list_empty(&mddev->all_mddevs)) {
if (!mddev_get(mddev)) {
spin_unlock(&all_mddevs_lock);
return -EBUSY;
}
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
rv = entry->show(mddev, page);
@ -5557,18 +5539,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
spin_lock(&all_mddevs_lock);
if (list_empty(&mddev->all_mddevs)) {
if (!mddev_get(mddev)) {
spin_unlock(&all_mddevs_lock);
return -EBUSY;
}
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
rv = entry->store(mddev, page, length);
mddev_put(mddev);
return rv;
}
static void md_free(struct kobject *ko)
static void md_kobj_release(struct kobject *ko)
{
struct mddev *mddev = container_of(ko, struct mddev, kobj);
@ -5577,15 +5558,8 @@ static void md_free(struct kobject *ko)
if (mddev->sysfs_level)
sysfs_put(mddev->sysfs_level);
if (mddev->gendisk) {
del_gendisk(mddev->gendisk);
put_disk(mddev->gendisk);
}
percpu_ref_exit(&mddev->writes_pending);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
kfree(mddev);
del_gendisk(mddev->gendisk);
put_disk(mddev->gendisk);
}
static const struct sysfs_ops md_sysfs_ops = {
@ -5593,7 +5567,7 @@ static const struct sysfs_ops md_sysfs_ops = {
.store = md_attr_store,
};
static struct kobj_type md_ktype = {
.release = md_free,
.release = md_kobj_release,
.sysfs_ops = &md_sysfs_ops,
.default_groups = md_attr_groups,
};
@ -5604,7 +5578,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
kobject_del(&mddev->kobj);
kobject_put(&mddev->kobj);
}
@ -5623,7 +5596,7 @@ int mddev_init_writes_pending(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
static int md_alloc(dev_t dev, char *name)
struct mddev *md_alloc(dev_t dev, char *name)
{
/*
* If dev is zero, name is the name of a device to allocate with
@ -5651,8 +5624,8 @@ static int md_alloc(dev_t dev, char *name)
mutex_lock(&disks_mutex);
mddev = mddev_alloc(dev);
if (IS_ERR(mddev)) {
mutex_unlock(&disks_mutex);
return PTR_ERR(mddev);
error = PTR_ERR(mddev);
goto out_unlock;
}
partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
@ -5670,7 +5643,7 @@ static int md_alloc(dev_t dev, char *name)
strcmp(mddev2->gendisk->disk_name, name) == 0) {
spin_unlock(&all_mddevs_lock);
error = -EEXIST;
goto out_unlock_disks_mutex;
goto out_free_mddev;
}
spin_unlock(&all_mddevs_lock);
}
@ -5683,7 +5656,7 @@ static int md_alloc(dev_t dev, char *name)
error = -ENOMEM;
disk = blk_alloc_disk(NUMA_NO_NODE);
if (!disk)
goto out_unlock_disks_mutex;
goto out_free_mddev;
disk->major = MAJOR(mddev->unit);
disk->first_minor = unit << shift;
@ -5704,25 +5677,45 @@ static int md_alloc(dev_t dev, char *name)
mddev->gendisk = disk;
error = add_disk(disk);
if (error)
goto out_cleanup_disk;
goto out_put_disk;
kobject_init(&mddev->kobj, &md_ktype);
error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
if (error)
goto out_del_gendisk;
if (error) {
/*
* The disk is already live at this point. Clear the hold flag
* and let mddev_put take care of the deletion, as it isn't any
* different from a normal close on last release now.
*/
mddev->hold_active = 0;
mutex_unlock(&disks_mutex);
mddev_put(mddev);
return ERR_PTR(error);
}
kobject_uevent(&mddev->kobj, KOBJ_ADD);
mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
goto out_unlock_disks_mutex;
out_del_gendisk:
del_gendisk(disk);
out_cleanup_disk:
put_disk(disk);
out_unlock_disks_mutex:
mutex_unlock(&disks_mutex);
return mddev;
out_put_disk:
put_disk(disk);
out_free_mddev:
mddev_free(mddev);
out_unlock:
mutex_unlock(&disks_mutex);
return ERR_PTR(error);
}
/*
 * Allocate an array via md_alloc() for callers that do not need the
 * resulting mddev, and immediately drop the mddev it returned with
 * mddev_put().
 *
 * Returns 0 on success or the negative errno encoded in md_alloc()'s
 * ERR_PTR result.
 */
static int md_alloc_and_put(dev_t dev, char *name)
{
	struct mddev *mddev = md_alloc(dev, name);

	if (IS_ERR(mddev))
		return PTR_ERR(mddev);

	mddev_put(mddev);
	return 0;
}
static void md_probe(dev_t dev)
@ -5730,7 +5723,7 @@ static void md_probe(dev_t dev)
if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
return;
if (create_on_open)
md_alloc(dev, NULL);
md_alloc_and_put(dev, NULL);
}
static int add_named_array(const char *val, const struct kernel_param *kp)
@ -5752,12 +5745,12 @@ static int add_named_array(const char *val, const struct kernel_param *kp)
return -E2BIG;
strscpy(buf, val, len+1);
if (strncmp(buf, "md_", 3) == 0)
return md_alloc(0, buf);
return md_alloc_and_put(0, buf);
if (strncmp(buf, "md", 2) == 0 &&
isdigit(buf[2]) &&
kstrtoul(buf+2, 10, &devnum) == 0 &&
devnum <= MINORMASK)
return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
return -EINVAL;
}
@ -6197,6 +6190,7 @@ static void __md_stop_writes(struct mddev *mddev)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
@ -6244,11 +6238,11 @@ static void mddev_detach(struct mddev *mddev)
static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
md_bitmap_destroy(mddev);
mddev_detach(mddev);
/* Ensure ->event_work is done */
if (mddev->event_work.func)
flush_workqueue(md_misc_wq);
md_bitmap_destroy(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
spin_unlock(&mddev->lock);
@ -6497,9 +6491,8 @@ static void autorun_devices(int part)
break;
}
md_probe(dev);
mddev = mddev_find(dev);
if (!mddev)
mddev = md_alloc(dev, NULL);
if (IS_ERR(mddev))
break;
if (mddev_lock(mddev))
@ -7782,45 +7775,33 @@ static int md_set_read_only(struct block_device *bdev, bool ro)
static int md_open(struct block_device *bdev, fmode_t mode)
{
/*
* Succeed if we can lock the mddev, which confirms that
* it isn't being stopped right now.
*/
struct mddev *mddev = mddev_find(bdev->bd_dev);
struct mddev *mddev;
int err;
spin_lock(&all_mddevs_lock);
mddev = mddev_get(bdev->bd_disk->private_data);
spin_unlock(&all_mddevs_lock);
if (!mddev)
return -ENODEV;
if (mddev->gendisk != bdev->bd_disk) {
/* we are racing with mddev_put which is discarding this
* bd_disk.
*/
mddev_put(mddev);
/* Wait until bdev->bd_disk is definitely gone */
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
return -EBUSY;
}
BUG_ON(mddev != bdev->bd_disk->private_data);
if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
err = mutex_lock_interruptible(&mddev->open_mutex);
if (err)
goto out;
if (test_bit(MD_CLOSING, &mddev->flags)) {
mutex_unlock(&mddev->open_mutex);
err = -ENODEV;
goto out;
}
err = -ENODEV;
if (test_bit(MD_CLOSING, &mddev->flags))
goto out_unlock;
err = 0;
atomic_inc(&mddev->openers);
mutex_unlock(&mddev->open_mutex);
bdev_check_media_change(bdev);
out:
if (err)
mddev_put(mddev);
return 0;
out_unlock:
mutex_unlock(&mddev->open_mutex);
out:
mddev_put(mddev);
return err;
}
@ -7844,6 +7825,17 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
return ret;
}
/*
 * ->free_disk callback for md gendisks.
 *
 * Tears down the writes_pending percpu ref and both biosets of the mddev
 * stored in disk->private_data, then frees the mddev via mddev_free().
 */
static void md_free_disk(struct gendisk *disk)
{
	struct mddev *md = disk->private_data;

	percpu_ref_exit(&md->writes_pending);

	bioset_exit(&md->bio_set);
	bioset_exit(&md->sync_set);

	mddev_free(md);
}
const struct block_device_operations md_fops =
{
.owner = THIS_MODULE,
@ -7857,6 +7849,7 @@ const struct block_device_operations md_fops =
.getgeo = md_getgeo,
.check_events = md_check_events,
.set_read_only = md_set_read_only,
.free_disk = md_free_disk,
};
static int md_thread(void *arg)
@ -8018,16 +8011,26 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync;
if (resync <= 3) {
if (resync < MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
} else if (resync > max_sectors)
} else if (resync > max_sectors) {
resync = max_sectors;
else
} else {
resync -= atomic_read(&mddev->recovery_active);
if (resync < MD_RESYNC_ACTIVE) {
/*
* Resync has started, but the subtraction has
* yielded one of the special values. Force it
* to active to ensure the status reports an
* active resync.
*/
resync = MD_RESYNC_ACTIVE;
}
}
if (resync == 0) {
if (resync == MD_RESYNC_NONE) {
if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
struct md_rdev *rdev;
@ -8051,7 +8054,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
}
return 0;
}
if (resync < 3) {
if (resync < MD_RESYNC_ACTIVE) {
seq_printf(seq, "\tresync=DELAYED");
return 1;
}
@ -8152,6 +8155,8 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
if (!l--) {
mddev = list_entry(tmp, struct mddev, all_mddevs);
mddev_get(mddev);
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
return mddev;
}
@ -8165,25 +8170,35 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct list_head *tmp;
struct mddev *next_mddev, *mddev = v;
struct mddev *to_put = NULL;
++*pos;
if (v == (void*)2)
return NULL;
spin_lock(&all_mddevs_lock);
if (v == (void*)1)
if (v == (void*)1) {
tmp = all_mddevs.next;
else
} else {
to_put = mddev;
tmp = mddev->all_mddevs.next;
}
for (;;) {
if (tmp == &all_mddevs) {
next_mddev = (void*)2;
*pos = 0x10000;
break;
}
next_mddev = list_entry(tmp, struct mddev, all_mddevs);
if (mddev_get(next_mddev))
break;
mddev = next_mddev;
tmp = mddev->all_mddevs.next;
if (tmp != &all_mddevs)
next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
else {
next_mddev = (void*)2;
*pos = 0x10000;
}
spin_unlock(&all_mddevs_lock);
if (v != (void*)1)
if (to_put)
mddev_put(mddev);
return next_mddev;
@ -8682,7 +8697,6 @@ void md_do_sync(struct md_thread *thread)
unsigned long update_time;
sector_t mark_cnt[SYNC_MARKS];
int last_mark,m;
struct list_head *tmp;
sector_t last_check;
int skipped = 0;
struct md_rdev *rdev;
@ -8729,13 +8743,7 @@ void md_do_sync(struct md_thread *thread)
mddev->last_sync_action = action ?: desc;
/* we overload curr_resync somewhat here.
* 0 == not engaged in resync at all
* 2 == checking that there is no conflict with another sync
* 1 == like 2, but have yielded to allow conflicting resync to
* commence
* other == active in resync - this many blocks
*
/*
* Before starting a resync we must have set curr_resync to
* 2, and then checked that every "conflicting" array has curr_resync
* less than ours. When we find one that is the same or higher
@ -8747,24 +8755,29 @@ void md_do_sync(struct md_thread *thread)
do {
int mddev2_minor = -1;
mddev->curr_resync = 2;
mddev->curr_resync = MD_RESYNC_DELAYED;
try_again:
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto skip;
for_each_mddev(mddev2, tmp) {
spin_lock(&all_mddevs_lock);
list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
if (test_bit(MD_DELETED, &mddev2->flags))
continue;
if (mddev2 == mddev)
continue;
if (!mddev->parallel_resync
&& mddev2->curr_resync
&& match_mddev_units(mddev, mddev2)) {
DEFINE_WAIT(wq);
if (mddev < mddev2 && mddev->curr_resync == 2) {
if (mddev < mddev2 &&
mddev->curr_resync == MD_RESYNC_DELAYED) {
/* arbitrarily yield */
mddev->curr_resync = 1;
mddev->curr_resync = MD_RESYNC_YIELDED;
wake_up(&resync_wait);
}
if (mddev > mddev2 && mddev->curr_resync == 1)
if (mddev > mddev2 &&
mddev->curr_resync == MD_RESYNC_YIELDED)
/* no need to wait here, we can wait the next
* time 'round when curr_resync == 2
*/
@ -8782,7 +8795,8 @@ void md_do_sync(struct md_thread *thread)
desc, mdname(mddev),
mdname(mddev2));
}
mddev_put(mddev2);
spin_unlock(&all_mddevs_lock);
if (signal_pending(current))
flush_signals(current);
schedule();
@ -8792,7 +8806,8 @@ void md_do_sync(struct md_thread *thread)
finish_wait(&resync_wait, &wq);
}
}
} while (mddev->curr_resync < 2);
spin_unlock(&all_mddevs_lock);
} while (mddev->curr_resync < MD_RESYNC_DELAYED);
j = 0;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@ -8876,7 +8891,7 @@ void md_do_sync(struct md_thread *thread)
desc, mdname(mddev));
mddev->curr_resync = j;
} else
mddev->curr_resync = 3; /* no longer delayed */
mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
mddev->curr_resync_completed = j;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
md_new_event();
@ -9011,14 +9026,14 @@ void md_do_sync(struct md_thread *thread)
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->curr_resync > 3) {
mddev->curr_resync >= MD_RESYNC_ACTIVE) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 3) {
mddev->curr_resync >= MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->recovery_cp) {
@ -9082,7 +9097,7 @@ void md_do_sync(struct md_thread *thread)
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
mddev->curr_resync = 0;
mddev->curr_resync = MD_RESYNC_NONE;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
@ -9303,6 +9318,7 @@ void md_check_recovery(struct mddev *mddev)
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@ -9338,6 +9354,7 @@ void md_check_recovery(struct mddev *mddev)
goto unlock;
}
if (mddev->sync_thread) {
md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
goto unlock;
}
@ -9417,8 +9434,7 @@ void md_reap_sync_thread(struct mddev *mddev)
sector_t old_dev_sectors = mddev->dev_sectors;
bool is_reshaped = false;
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
/* sync_thread should be unregistered, collect result */
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
mddev->degraded != mddev->raid_disks) {
@ -9466,6 +9482,7 @@ void md_reap_sync_thread(struct mddev *mddev)
wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event();
if (mddev->event_work.func)
@ -9544,11 +9561,14 @@ EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
struct list_head *tmp;
struct mddev *mddev;
struct mddev *mddev, *n;
int need_delay = 0;
for_each_mddev(mddev, tmp) {
spin_lock(&all_mddevs_lock);
list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
if (mddev_trylock(mddev)) {
if (mddev->pers)
__md_stop_writes(mddev);
@ -9557,7 +9577,11 @@ static int md_notify_reboot(struct notifier_block *this,
mddev_unlock(mddev);
}
need_delay = 1;
mddev_put(mddev);
spin_lock(&all_mddevs_lock);
}
spin_unlock(&all_mddevs_lock);
/*
* certain more exotic SCSI devices are known to be
* volatile wrt too early system reboots. While the
@ -9876,8 +9900,7 @@ void md_autostart_arrays(int part)
static __exit void md_exit(void)
{
struct mddev *mddev;
struct list_head *tmp;
struct mddev *mddev, *n;
int delay = 1;
unregister_blkdev(MD_MAJOR,"md");
@ -9897,17 +9920,24 @@ static __exit void md_exit(void)
}
remove_proc_entry("mdstat", NULL);
for_each_mddev(mddev, tmp) {
spin_lock(&all_mddevs_lock);
list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
export_array(mddev);
mddev->ctime = 0;
mddev->hold_active = 0;
/*
* for_each_mddev() will call mddev_put() at the end of each
* iteration. As the mddev is now fully clear, this will
* schedule the mddev for destruction by a workqueue, and the
* As the mddev is now fully clear, mddev_put will schedule
* the mddev for destruction by a workqueue, and the
* destroy_workqueue() below will wait for that to complete.
*/
mddev_put(mddev);
spin_lock(&all_mddevs_lock);
}
spin_unlock(&all_mddevs_lock);
destroy_workqueue(md_rdev_misc_wq);
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);

View File

@ -254,6 +254,7 @@ struct md_cluster_info;
* @MD_NOT_READY: do_md_run() is active, so 'array_state' must not report that
* array is ready yet.
* @MD_BROKEN: This is used to stop writes and mark array as failed.
* @MD_DELETED: This device is being deleted
*
* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
*/
@ -270,6 +271,7 @@ enum mddev_flags {
MD_UPDATING_SB,
MD_NOT_READY,
MD_BROKEN,
MD_DELETED,
};
enum mddev_sb_flags {
@ -288,6 +290,21 @@ struct serial_info {
sector_t _subtree_last; /* highest sector in subtree of rb node */
};
/*
 * Reserved values of mddev->curr_resync.
 *
 * The field normally holds the current sector of the resync; the small
 * values below are overloaded as state markers instead.
 */
enum {
	MD_RESYNC_NONE    = 0,	/* no resync in progress */
	MD_RESYNC_YIELDED = 1,	/* yielded so a conflicting resync can commence */
	MD_RESYNC_DELAYED = 2,	/* delayed while checking for a conflicting sync */
	MD_RESYNC_ACTIVE  = 3,	/* this value or anything above: resync is active */
};
struct mddev {
void *private;
struct md_personality *pers;
@ -750,6 +767,8 @@ extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void mddev_init(struct mddev *mddev);
struct mddev *md_alloc(dev_t dev, char *name);
void mddev_put(struct mddev *mddev);
extern int md_run(struct mddev *mddev);
extern int md_start(struct mddev *mddev);
extern void md_stop(struct mddev *mddev);

View File

@ -2167,9 +2167,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
int err = 0;
int number = rdev->raid_disk;
struct md_rdev **rdevp;
struct raid10_info *p = conf->mirrors + number;
struct raid10_info *p;
print_conf(conf);
if (unlikely(number >= mddev->raid_disks))
return 0;
p = conf->mirrors + number;
if (rdev == p->rdev)
rdevp = &p->rdev;
else if (rdev == p->replacement)

View File

@ -1590,18 +1590,13 @@ void r5l_quiesce(struct r5l_log *log, int quiesce)
bool r5l_log_disk_error(struct r5conf *conf)
{
struct r5l_log *log;
bool ret;
/* don't allow write if journal disk is missing */
rcu_read_lock();
log = rcu_dereference(conf->log);
struct r5l_log *log = conf->log;
/* don't allow write if journal disk is missing */
if (!log)
ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
else
ret = test_bit(Faulty, &log->rdev->flags);
rcu_read_unlock();
return ret;
return test_bit(Faulty, &log->rdev->flags);
}
#define R5L_RECOVERY_PAGE_POOL_SIZE 256
@ -2534,12 +2529,13 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
struct r5conf *conf;
int ret;
spin_lock(&mddev->lock);
ret = mddev_lock(mddev);
if (ret)
return ret;
conf = mddev->private;
if (!conf || !conf->log) {
spin_unlock(&mddev->lock);
return 0;
}
if (!conf || !conf->log)
goto out_unlock;
switch (conf->log->r5c_journal_mode) {
case R5C_JOURNAL_MODE_WRITE_THROUGH:
@ -2557,7 +2553,9 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
default:
ret = 0;
}
spin_unlock(&mddev->lock);
out_unlock:
mddev_unlock(mddev);
return ret;
}
@ -2639,7 +2637,7 @@ int r5c_try_caching_write(struct r5conf *conf,
int i;
struct r5dev *dev;
int to_cache = 0;
void **pslot;
void __rcu **pslot;
sector_t tree_index;
int ret;
uintptr_t refcount;
@ -2806,7 +2804,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
int i;
int do_wakeup = 0;
sector_t tree_index;
void **pslot;
void __rcu **pslot;
uintptr_t refcount;
if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
@ -3145,7 +3143,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);
rcu_assign_pointer(conf->log, log);
conf->log = log;
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
@ -3167,13 +3165,13 @@ void r5l_exit_log(struct r5conf *conf)
{
struct r5l_log *log = conf->log;
conf->log = NULL;
synchronize_rcu();
/* Ensure disable_writeback_work wakes up and exits */
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
conf->log = NULL;
mempool_exit(&log->meta_pool);
bioset_exit(&log->bs);
mempool_exit(&log->io_pool);

View File

@ -2,49 +2,46 @@
#ifndef _RAID5_LOG_H
#define _RAID5_LOG_H
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5conf *conf);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
extern void r5l_write_stripe_run(struct r5l_log *log);
extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
extern void r5l_stripe_write_finished(struct stripe_head *sh);
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
extern void r5l_quiesce(struct r5l_log *log, int quiesce);
extern bool r5l_log_disk_error(struct r5conf *conf);
extern bool r5c_is_writeback(struct r5l_log *log);
extern int
r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks);
extern void
r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s);
extern void r5c_release_extra_page(struct stripe_head *sh);
extern void r5c_use_extra_page(struct stripe_head *sh);
extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
extern void r5c_handle_cached_data_endio(struct r5conf *conf,
struct stripe_head *sh, int disks);
extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
extern void r5c_make_stripe_write_out(struct stripe_head *sh);
extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
extern void r5c_check_cached_full_stripe(struct r5conf *conf);
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
void r5l_exit_log(struct r5conf *conf);
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
void r5l_write_stripe_run(struct r5l_log *log);
void r5l_flush_stripe_to_raid(struct r5l_log *log);
void r5l_stripe_write_finished(struct stripe_head *sh);
int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
void r5l_quiesce(struct r5l_log *log, int quiesce);
bool r5l_log_disk_error(struct r5conf *conf);
bool r5c_is_writeback(struct r5l_log *log);
int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks);
void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s);
void r5c_release_extra_page(struct stripe_head *sh);
void r5c_use_extra_page(struct stripe_head *sh);
void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
void r5c_handle_cached_data_endio(struct r5conf *conf,
struct stripe_head *sh, int disks);
int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
void r5c_make_stripe_write_out(struct stripe_head *sh);
void r5c_flush_cache(struct r5conf *conf, int num);
void r5c_check_stripe_cache_usage(struct r5conf *conf);
void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
extern void r5c_update_on_rdev_error(struct mddev *mddev,
struct md_rdev *rdev);
extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
extern int r5l_start(struct r5l_log *log);
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev);
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
int r5l_start(struct r5l_log *log);
extern struct dma_async_tx_descriptor *
struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
struct dma_async_tx_descriptor *tx);
extern int ppl_init_log(struct r5conf *conf);
extern void ppl_exit_log(struct r5conf *conf);
extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
extern void ppl_write_stripe_run(struct r5conf *conf);
extern void ppl_stripe_write_finished(struct stripe_head *sh);
extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
extern void ppl_quiesce(struct r5conf *conf, int quiesce);
extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
int ppl_init_log(struct r5conf *conf);
void ppl_exit_log(struct r5conf *conf);
int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
void ppl_write_stripe_run(struct r5conf *conf);
void ppl_stripe_write_finished(struct stripe_head *sh);
int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
void ppl_quiesce(struct r5conf *conf, int quiesce);
int ppl_handle_flush_request(struct bio *bio);
extern struct md_sysfs_entry ppl_write_hint;
static inline bool raid5_has_log(struct r5conf *conf)
@ -111,7 +108,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
if (conf->log)
ret = r5l_handle_flush_request(conf->log, bio);
else if (raid5_has_ppl(conf))
ret = ppl_handle_flush_request(conf->log, bio);
ret = ppl_handle_flush_request(bio);
return ret;
}

View File

@ -679,7 +679,7 @@ void ppl_quiesce(struct r5conf *conf, int quiesce)
}
}
int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
int ppl_handle_flush_request(struct bio *bio)
{
if (bio->bi_iter.bi_size == 0) {
bio_endio(bio);

View File

@ -61,6 +61,8 @@
#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE
#define RAID5_MAX_REQ_STRIPES 256
static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
@ -624,6 +626,49 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
return NULL;
}
/*
 * Look up the stripe_head for @sector/@generation with __find_stripe() and
 * take a reference on it.
 *
 * @hash selects the conf->inactive_list bucket whose emptiness is tracked
 * when the stripe has to be pulled off a list.
 *
 * Returns NULL if __find_stripe() finds nothing, otherwise the stripe with
 * sh->count incremented.
 */
static struct stripe_head *find_get_stripe(struct r5conf *conf,
		sector_t sector, short generation, int hash)
{
	struct stripe_head *sh = __find_stripe(conf, sector, generation);
	int inactive_was_populated;

	if (!sh)
		return NULL;

	/* Fast path: the stripe is already referenced, just add ours. */
	if (atomic_inc_not_zero(&sh->count))
		return sh;

	/*
	 * Slow path. A zero reference count means the stripe sits on a
	 * list through sh->lru; it has to be unlinked from that list
	 * while holding device_lock.
	 */
	spin_lock(&conf->device_lock);
	if (atomic_read(&sh->count) == 0) {
		if (!test_bit(STRIPE_HANDLE, &sh->state))
			atomic_inc(&conf->active_stripes);
		BUG_ON(list_empty(&sh->lru) &&
		       !test_bit(STRIPE_EXPANDING, &sh->state));

		/* Count the bucket as newly empty only if we emptied it. */
		inactive_was_populated = !list_empty(conf->inactive_list + hash);
		list_del_init(&sh->lru);
		if (inactive_was_populated &&
		    list_empty(conf->inactive_list + hash))
			atomic_inc(&conf->empty_inactive_list_nr);

		if (sh->group) {
			sh->group->stripes_cnt--;
			sh->group = NULL;
		}
	}
	atomic_inc(&sh->count);
	spin_unlock(&conf->device_lock);

	return sh;
}
/*
* Need to check if array has failed when deciding whether to:
* - start an array
@ -710,80 +755,121 @@ static bool has_failed(struct r5conf *conf)
return degraded > conf->max_degraded;
}
struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce)
/* Outcome of make_stripe_request() for a single stripe of a bio. */
enum stripe_result {
	STRIPE_SUCCESS            = 0,	/* bio was attached to the stripe */
	STRIPE_RETRY              = 1,	/* generation changed underneath us */
	STRIPE_SCHEDULE_AND_RETRY = 2,	/* reshape/overlap conflict, wait first */
	STRIPE_FAIL               = 3,	/* no stripe_head could be obtained */
};
/*
 * Per-request state shared by the stripes of one bio while
 * raid5_make_request() walks it stripe by stripe.
 */
struct stripe_request_ctx {
/*
 * A reference to the last stripe_head handed to batching; it is released
 * when replaced by a newer stripe or before waiting on quiesce.
 */
struct stripe_head *batch_last;
/* first sector in the request (the bio start rounded down to a stripe) */
sector_t first_sector;
/* end sector of the request, used as an exclusive upper bound */
sector_t last_sector;
/*
* bitmap to track stripe sectors that still have to be added to stripes;
* a bit is cleared once the bio has been attached for that stripe.
* add one to account for unaligned requests
*/
DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);
/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
bool do_flush;
};
/*
 * Wake-up condition for a waiter on conf->wait_for_stripe.
 *
 * Despite the name, a true result means the wait may END: there is an
 * inactive stripe in bucket @hash and either another thread has cleared
 * R5_INACTIVE_BLOCKED or fewer than 3/4 of max_nr_stripes are active.
 * With no inactive stripe available the waiter keeps blocking.
 */
static bool is_inactive_blocked(struct r5conf *conf, int hash)
{
	const int nr_active = atomic_read(&conf->active_stripes);
	const bool have_inactive = !list_empty(conf->inactive_list + hash);

	if (!have_inactive)
		return false;

	return !test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state) ||
	       nr_active < (conf->max_nr_stripes * 3 / 4);
}
static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf,
struct stripe_request_ctx *ctx, sector_t sector,
bool previous, bool noblock, bool noquiesce)
{
struct stripe_head *sh;
int hash = stripe_hash_locks_hash(conf, sector);
int inc_empty_inactive_list_flag;
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
spin_lock_irq(conf->hash_locks + hash);
do {
wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
sh = get_free_stripe(conf, hash);
if (!sh && !test_bit(R5_DID_ALLOC,
&conf->cache_state))
set_bit(R5_ALLOC_MORE,
&conf->cache_state);
}
if (noblock && sh == NULL)
break;
r5c_check_stripe_cache_usage(conf);
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
r5l_wake_reclaim(conf->log, 0);
wait_event_lock_irq(
conf->wait_for_stripe,
!list_empty(conf->inactive_list + hash) &&
(atomic_read(&conf->active_stripes)
< (conf->max_nr_stripes * 3 / 4)
|| !test_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state)),
*(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
} else {
init_stripe(sh, sector, previous);
atomic_inc(&sh->count);
}
} else if (!atomic_inc_not_zero(&sh->count)) {
spin_lock(&conf->device_lock);
if (!atomic_read(&sh->count)) {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
BUG_ON(list_empty(&sh->lru) &&
!test_bit(STRIPE_EXPANDING, &sh->state));
inc_empty_inactive_list_flag = 0;
if (!list_empty(conf->inactive_list + hash))
inc_empty_inactive_list_flag = 1;
list_del_init(&sh->lru);
if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
atomic_inc(&conf->empty_inactive_list_nr);
if (sh->group) {
sh->group->stripes_cnt--;
sh->group = NULL;
}
}
atomic_inc(&sh->count);
spin_unlock(&conf->device_lock);
retry:
if (!noquiesce && conf->quiesce) {
/*
* Must release the reference to batch_last before waiting,
* on quiesce, otherwise the batch_last will hold a reference
* to a stripe and raid5_quiesce() will deadlock waiting for
* active_stripes to go to zero.
*/
if (ctx && ctx->batch_last) {
raid5_release_stripe(ctx->batch_last);
ctx->batch_last = NULL;
}
} while (sh == NULL);
wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce,
*(conf->hash_locks + hash));
}
sh = find_get_stripe(conf, sector, conf->generation - previous, hash);
if (sh)
goto out;
if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
goto wait_for_stripe;
sh = get_free_stripe(conf, hash);
if (sh) {
r5c_check_stripe_cache_usage(conf);
init_stripe(sh, sector, previous);
atomic_inc(&sh->count);
goto out;
}
if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
set_bit(R5_ALLOC_MORE, &conf->cache_state);
wait_for_stripe:
if (noblock)
goto out;
set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
r5l_wake_reclaim(conf->log, 0);
wait_event_lock_irq(conf->wait_for_stripe,
is_inactive_blocked(conf, hash),
*(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
goto retry;
out:
spin_unlock_irq(conf->hash_locks + hash);
return sh;
}
/*
 * Get a referenced stripe_head for @sector without a request context.
 *
 * Forwards to __raid5_get_active_stripe() with a NULL stripe_request_ctx,
 * so there is no batch_last reference for it to drop while waiting.
 */
struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
		sector_t sector, bool previous, bool noblock, bool noquiesce)
{
	struct stripe_request_ctx *no_ctx = NULL;

	return __raid5_get_active_stripe(conf, no_ctx, sector, previous,
					 noblock, noquiesce);
}
static bool is_full_stripe_write(struct stripe_head *sh)
{
BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
@ -824,13 +910,13 @@ static bool stripe_can_batch(struct stripe_head *sh)
}
/* we only do back search */
static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
static void stripe_add_to_batch_list(struct r5conf *conf,
struct stripe_head *sh, struct stripe_head *last_sh)
{
struct stripe_head *head;
sector_t head_sector, tmp_sec;
int hash;
int dd_idx;
int inc_empty_inactive_list_flag;
/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
tmp_sec = sh->sector;
@ -838,36 +924,20 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
return;
head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
hash = stripe_hash_locks_hash(conf, head_sector);
spin_lock_irq(conf->hash_locks + hash);
head = __find_stripe(conf, head_sector, conf->generation);
if (head && !atomic_inc_not_zero(&head->count)) {
spin_lock(&conf->device_lock);
if (!atomic_read(&head->count)) {
if (!test_bit(STRIPE_HANDLE, &head->state))
atomic_inc(&conf->active_stripes);
BUG_ON(list_empty(&head->lru) &&
!test_bit(STRIPE_EXPANDING, &head->state));
inc_empty_inactive_list_flag = 0;
if (!list_empty(conf->inactive_list + hash))
inc_empty_inactive_list_flag = 1;
list_del_init(&head->lru);
if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
atomic_inc(&conf->empty_inactive_list_nr);
if (head->group) {
head->group->stripes_cnt--;
head->group = NULL;
}
}
if (last_sh && head_sector == last_sh->sector) {
head = last_sh;
atomic_inc(&head->count);
spin_unlock(&conf->device_lock);
} else {
hash = stripe_hash_locks_hash(conf, head_sector);
spin_lock_irq(conf->hash_locks + hash);
head = find_get_stripe(conf, head_sector, conf->generation,
hash);
spin_unlock_irq(conf->hash_locks + hash);
if (!head)
return;
if (!stripe_can_batch(head))
goto out;
}
spin_unlock_irq(conf->hash_locks + hash);
if (!head)
return;
if (!stripe_can_batch(head))
goto out;
lock_two_stripes(head, sh);
/* clear_batch_ready clear the flag */
@ -2882,10 +2952,10 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
if (sh->batch_head && sh != sh->batch_head)
raid5_release_stripe(sh->batch_head);
raid5_release_stripe(sh);
}
static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
@ -3413,39 +3483,32 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked, s->ops_request);
}
/*
* Each stripe/dev can have one or more bion attached.
* toread/towrite point to the first in a chain.
* The bi_next chain must be in order.
*/
static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
int forwrite, int previous)
static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
int dd_idx, int forwrite)
{
struct bio **bip;
struct r5conf *conf = sh->raid_conf;
int firstwrite=0;
struct bio **bip;
pr_debug("adding bi b#%llu to stripe s#%llu\n",
(unsigned long long)bi->bi_iter.bi_sector,
(unsigned long long)sh->sector);
pr_debug("checking bi b#%llu to stripe s#%llu\n",
bi->bi_iter.bi_sector, sh->sector);
spin_lock_irq(&sh->stripe_lock);
/* Don't allow new IO added to stripes in batch list */
if (sh->batch_head)
goto overlap;
if (forwrite) {
return true;
if (forwrite)
bip = &sh->dev[dd_idx].towrite;
if (*bip == NULL)
firstwrite = 1;
} else
else
bip = &sh->dev[dd_idx].toread;
while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
goto overlap;
bip = & (*bip)->bi_next;
return true;
bip = &(*bip)->bi_next;
}
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
return true;
if (forwrite && raid5_has_ppl(conf)) {
/*
@ -3474,9 +3537,30 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
}
if (first + conf->chunk_sectors * (count - 1) != last)
goto overlap;
return true;
}
return false;
}
static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
int dd_idx, int forwrite, int previous)
{
struct r5conf *conf = sh->raid_conf;
struct bio **bip;
int firstwrite = 0;
if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
if (!*bip)
firstwrite = 1;
} else {
bip = &sh->dev[dd_idx].toread;
}
while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
bip = &(*bip)->bi_next;
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
@ -3502,9 +3586,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
sh->overwrite_disks++;
}
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
(unsigned long long)(*bip)->bi_iter.bi_sector,
(unsigned long long)sh->sector, dd_idx);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
sh->dev[dd_idx].sector);
if (conf->mddev->bitmap && firstwrite) {
/* Cannot hold spinlock over bitmap_startwrite,
@ -3512,7 +3596,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
* we have added to the bitmap and set bm_seq.
* So set STRIPE_BITMAP_PENDING to prevent
* batching.
* If multiple add_stripe_bio() calls race here they
* If multiple __add_stripe_bio() calls race here they
* must all set STRIPE_BITMAP_PENDING. So only the first one
* to complete "bitmap_startwrite" gets to set
* STRIPE_BIT_DELAY. This is important as once a stripe
@ -3530,16 +3614,27 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
}
spin_unlock_irq(&sh->stripe_lock);
}
if (stripe_can_batch(sh))
stripe_add_to_batch_list(conf, sh);
return 1;
/*
* Each stripe/dev can have one or more bios attached.
* toread/towrite point to the first in a chain.
* The bi_next chain must be in order.
*/
static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi,
int dd_idx, int forwrite, int previous)
{
spin_lock_irq(&sh->stripe_lock);
overlap:
set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
spin_unlock_irq(&sh->stripe_lock);
return false;
}
__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
spin_unlock_irq(&sh->stripe_lock);
return 0;
return true;
}
static void end_reshape(struct r5conf *conf);
@ -5785,17 +5880,215 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
bio_endio(bi);
}
/*
 * Report whether @sector lies ahead of @reshape_sector in the direction
 * the reshape is moving: below it for a backwards reshape, at or above it
 * otherwise.
 */
static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
			     sector_t reshape_sector)
{
	if (mddev->reshape_backwards)
		return sector < reshape_sector;

	return sector >= reshape_sector;
}
/*
 * Report whether the whole range [@min, @max] lies ahead of
 * @reshape_sector: for a backwards reshape the top of the range must be
 * below it, otherwise the bottom of the range must be at or above it.
 */
static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
				   sector_t max, sector_t reshape_sector)
{
	if (mddev->reshape_backwards)
		return max < reshape_sector;

	return min >= reshape_sector;
}
/*
 * Re-check, with a reference to @sh already held, whether the stripe still
 * belongs to the not-yet-reshaped region.
 *
 * Computes the lowest and highest logical sector over the data devices of
 * @sh and compares that range against conf->reshape_progress under
 * conf->device_lock.
 *
 * Returns true when the range is NOT entirely ahead of the reshape
 * position, i.e. the reshape moved on and the caller has to try again;
 * returns false when the stripe is still ahead of the reshape.
 */
static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf,
				    struct stripe_head *sh)
{
	sector_t max_sector = 0, min_sector = MaxSector;
	bool ret = false;
	int dd_idx;

	for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
		sector_t dev_sector;

		/*
		 * Only data devices are considered; the P and Q parity
		 * slots are skipped, matching add_all_stripe_bios().
		 */
		if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
			continue;

		dev_sector = sh->dev[dd_idx].sector;
		if (dev_sector < min_sector)
			min_sector = dev_sector;
		/*
		 * Track the true maximum: folding with min() here would
		 * leave max_sector stuck at its initial value of 0.
		 */
		if (dev_sector > max_sector)
			max_sector = dev_sector;
	}

	/* reshape_progress is read under device_lock, as elsewhere here */
	spin_lock_irq(&conf->device_lock);

	if (!range_ahead_of_reshape(mddev, min_sector, max_sector,
				    conf->reshape_progress))
		/* mismatch, need to try again */
		ret = true;

	spin_unlock_irq(&conf->device_lock);

	return ret;
}
/*
 * Attach @bi to every data device of @sh whose sector falls inside the
 * request range [ctx->first_sector, ctx->last_sector).
 *
 * Runs in two passes under sh->stripe_lock. The first pass only checks for
 * overlaps and flags each conflicting device with R5_Overlap. The second
 * pass runs only if no conflict was found; it adds the bio to each device
 * and clears the matching bit in ctx->sectors_to_do.
 *
 * Returns 1 when the bio was added everywhere, 0 when any overlap was found
 * (in which case nothing was added).
 */
static int add_all_stripe_bios(struct r5conf *conf,
		struct stripe_request_ctx *ctx, struct stripe_head *sh,
		struct bio *bi, int forwrite, int previous)
{
	int overlap_free = 1;
	int i;

	spin_lock_irq(&sh->stripe_lock);

	/* Pass 1: detect overlaps on all devices touched by the request. */
	for (i = 0; i < sh->disks; i++) {
		struct r5dev *dev = &sh->dev[i];

		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		if (!(dev->sector >= ctx->first_sector &&
		      dev->sector < ctx->last_sector))
			continue;
		if (!stripe_bio_overlaps(sh, bi, i, forwrite))
			continue;

		set_bit(R5_Overlap, &dev->flags);
		overlap_free = 0;
	}

	/* Pass 2: nothing overlaps, so the bio can be added everywhere. */
	if (overlap_free) {
		for (i = 0; i < sh->disks; i++) {
			struct r5dev *dev = &sh->dev[i];

			if (i == sh->pd_idx || i == sh->qd_idx)
				continue;
			if (!(dev->sector >= ctx->first_sector &&
			      dev->sector < ctx->last_sector))
				continue;

			__add_stripe_bio(sh, bi, i, forwrite, previous);
			clear_bit((dev->sector - ctx->first_sector) >>
				  RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
		}
	}

	spin_unlock_irq(&sh->stripe_lock);
	return overlap_free;
}
/*
 * Attach @bi to the stripe_head that covers @logical_sector.
 *
 * Returns:
 *   STRIPE_SUCCESS            - the bio was added and the stripe released
 *                               through release_stripe_plug().
 *   STRIPE_RETRY              - conf->gen_lock changed while the stripe was
 *                               being looked up.
 *   STRIPE_SCHEDULE_AND_RETRY - the sector conflicts with an ongoing reshape,
 *                               the stripe is expanding, or adding the bio
 *                               failed due to overlap.
 *   STRIPE_FAIL               - no stripe_head could be obtained;
 *                               bi->bi_status is set to BLK_STS_IOERR.
 *
 * On every non-success path that obtained a stripe, the reference is dropped
 * via raid5_release_stripe() before returning.
 */
static enum stripe_result make_stripe_request(struct mddev *mddev,
struct r5conf *conf, struct stripe_request_ctx *ctx,
sector_t logical_sector, struct bio *bi)
{
const int rw = bio_data_dir(bi);
enum stripe_result ret;
struct stripe_head *sh;
sector_t new_sector;
int previous = 0;
int seq, dd_idx;
/* sample the generation; re-validated with read_seqcount_retry() below */
seq = read_seqcount_begin(&conf->gen_lock);
if (unlikely(conf->reshape_progress != MaxSector)) {
/*
* Spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
* possible to see a half-updated value
* Of course reshape_progress could change after
* the lock is dropped, so once we get a reference
* to the stripe that we think it is, we will have
* to check again.
*/
spin_lock_irq(&conf->device_lock);
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_progress)) {
previous = 1;
} else {
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
return STRIPE_SCHEDULE_AND_RETRY;
}
}
spin_unlock_irq(&conf->device_lock);
}
new_sector = raid5_compute_sector(conf, logical_sector, previous,
&dd_idx, NULL);
pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
new_sector, logical_sector);
/* read-ahead bios (REQ_RAHEAD) ask for a non-blocking stripe lookup */
sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous,
(bi->bi_opf & REQ_RAHEAD), 0);
if (unlikely(!sh)) {
/* cannot get stripe, just give-up */
bi->bi_status = BLK_STS_IOERR;
return STRIPE_FAIL;
}
if (unlikely(previous) &&
stripe_ahead_of_reshape(mddev, conf, sh)) {
/*
* Expansion moved on while waiting for a stripe.
* Expansion could still move past after this
* test, but as we are holding a reference to
* 'sh', we know that if that happens,
* STRIPE_EXPANDING will get set and the expansion
* won't proceed until we finish with the stripe.
*/
ret = STRIPE_SCHEDULE_AND_RETRY;
goto out_release;
}
if (read_seqcount_retry(&conf->gen_lock, seq)) {
/* Might have got the wrong stripe_head by accident */
ret = STRIPE_RETRY;
goto out_release;
}
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
/*
* Stripe is busy expanding or add failed due to
* overlap. Flush everything and wait a while.
*/
md_wakeup_thread(mddev->thread);
ret = STRIPE_SCHEDULE_AND_RETRY;
goto out_release;
}
if (stripe_can_batch(sh)) {
stripe_add_to_batch_list(conf, sh, ctx->batch_last);
/*
* Remember this stripe as the new batch_last: drop the reference
* held on the previous one and take an extra reference on sh.
*/
if (ctx->batch_last)
raid5_release_stripe(ctx->batch_last);
atomic_inc(&sh->count);
ctx->batch_last = sh;
}
if (ctx->do_flush) {
set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
/* we only need flush for one stripe */
ctx->do_flush = false;
}
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
/* only an unbatched stripe or a batch head is counted for REQ_SYNC */
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
release_stripe_plug(mddev, sh);
return STRIPE_SUCCESS;
/* error exit: drop the reference taken by __raid5_get_active_stripe() */
out_release:
raid5_release_stripe(sh);
return ret;
}
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct r5conf *conf = mddev->private;
int dd_idx;
sector_t new_sector;
sector_t logical_sector, last_sector;
struct stripe_head *sh;
sector_t logical_sector;
struct stripe_request_ctx ctx = {};
const int rw = bio_data_dir(bi);
DEFINE_WAIT(w);
bool do_prepare;
bool do_flush = false;
enum stripe_result res;
int s, stripe_cnt;
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
int ret = log_handle_flush_request(conf, bi);
@ -5811,7 +6104,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
* if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
* we need to flush journal device
*/
do_flush = bi->bi_opf & REQ_PREFLUSH;
ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
}
if (!md_write_start(mddev, bi))
@ -5835,134 +6128,68 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
}
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
last_sector = bio_end_sector(bi);
ctx.first_sector = logical_sector;
ctx.last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
RAID5_STRIPE_SECTORS(conf));
bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
bi->bi_iter.bi_sector, ctx.last_sector);
/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
if ((bi->bi_opf & REQ_NOWAIT) &&
(conf->reshape_progress != MaxSector) &&
(mddev->reshape_backwards
? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe)
: (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) {
!ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
bio_wouldblock_error(bi);
if (rw == WRITE)
md_write_end(mddev);
return true;
}
md_account_bio(mddev, &bi);
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
int previous;
int seq;
do_prepare = false;
retry:
seq = read_seqcount_begin(&conf->gen_lock);
previous = 0;
if (do_prepare)
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
if (unlikely(conf->reshape_progress != MaxSector)) {
/* spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
* possible to see a half-updated value
* Of course reshape_progress could change after
* the lock is dropped, so once we get a reference
* to the stripe that we think it is, we will have
* to check again.
*/
spin_lock_irq(&conf->device_lock);
if (mddev->reshape_backwards
? logical_sector < conf->reshape_progress
: logical_sector >= conf->reshape_progress) {
previous = 1;
} else {
if (mddev->reshape_backwards
? logical_sector < conf->reshape_safe
: logical_sector >= conf->reshape_safe) {
spin_unlock_irq(&conf->device_lock);
schedule();
do_prepare = true;
goto retry;
}
}
spin_unlock_irq(&conf->device_lock);
}
new_sector = raid5_compute_sector(conf, logical_sector,
previous,
&dd_idx, NULL);
pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
(unsigned long long)new_sector,
(unsigned long long)logical_sector);
sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_opf & REQ_RAHEAD), 0);
if (sh) {
if (unlikely(previous)) {
/* expansion might have moved on while waiting for a
* stripe, so we must do the range check again.
* Expansion could still move past after this
* test, but as we are holding a reference to
* 'sh', we know that if that happens,
* STRIPE_EXPANDING will get set and the expansion
* won't proceed until we finish with the stripe.
*/
int must_retry = 0;
spin_lock_irq(&conf->device_lock);
if (mddev->reshape_backwards
? logical_sector >= conf->reshape_progress
: logical_sector < conf->reshape_progress)
/* mismatch, need to try again */
must_retry = 1;
spin_unlock_irq(&conf->device_lock);
if (must_retry) {
raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
}
}
if (read_seqcount_retry(&conf->gen_lock, seq)) {
/* Might have got the wrong stripe_head
* by accident
*/
raid5_release_stripe(sh);
goto retry;
}
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
!add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
/* Stripe is busy expanding or
* add failed due to overlap. Flush everything
* and wait a while
*/
md_wakeup_thread(mddev->thread);
raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
}
if (do_flush) {
set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
/* we only need flush for one stripe */
do_flush = false;
}
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
release_stripe_plug(mddev, sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
bi->bi_status = BLK_STS_IOERR;
add_wait_queue(&conf->wait_for_overlap, &wait);
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
if (res == STRIPE_FAIL)
break;
if (res == STRIPE_RETRY)
continue;
if (res == STRIPE_SCHEDULE_AND_RETRY) {
/*
* Must release the reference to batch_last before
* scheduling and waiting for work to be done,
* otherwise the batch_last stripe head could prevent
* raid5_activate_delayed() from making progress
* and thus deadlocking.
*/
if (ctx.batch_last) {
raid5_release_stripe(ctx.batch_last);
ctx.batch_last = NULL;
}
wait_woken(&wait, TASK_UNINTERRUPTIBLE,
MAX_SCHEDULE_TIMEOUT);
continue;
}
s = find_first_bit(ctx.sectors_to_do, stripe_cnt);
if (s == stripe_cnt)
break;
logical_sector = ctx.first_sector +
(s << RAID5_STRIPE_SHIFT(conf));
}
finish_wait(&conf->wait_for_overlap, &w);
remove_wait_queue(&conf->wait_for_overlap, &wait);
if (ctx.batch_last)
raid5_release_stripe(ctx.batch_last);
if (rw == WRITE)
md_write_end(mddev);
@ -7815,7 +8042,15 @@ static int raid5_run(struct mddev *mddev)
mddev->queue->limits.discard_granularity < stripe)
blk_queue_max_discard_sectors(mddev->queue, 0);
blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
/*
* Requests require having a bitmap for each stripe.
* Limit the max sectors based on this.
*/
blk_queue_max_hw_sectors(mddev->queue,
RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
/* No restrictions on the number of segments in the request */
blk_queue_max_segments(mddev->queue, USHRT_MAX);
}
if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
@ -8066,8 +8301,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* find the disk ... but prefer rdev->saved_raid_disk
* if possible.
*/
if (rdev->saved_raid_disk >= 0 &&
rdev->saved_raid_disk >= first &&
if (rdev->saved_raid_disk >= first &&
rdev->saved_raid_disk <= last &&
conf->disks[rdev->saved_raid_disk].rdev == NULL)
first = rdev->saved_raid_disk;
@ -8704,8 +8938,11 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
err = log_init(conf, NULL, true);
if (!err) {
err = resize_stripes(conf, conf->pool_size);
if (err)
if (err) {
mddev_suspend(mddev);
log_exit(conf);
mddev_resume(mddev);
}
}
} else
err = -EINVAL;

View File

@ -812,7 +812,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
struct stripe_head *sh);
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
bool previous, bool noblock, bool noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
#endif

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
menu "NVME Support"
source "drivers/nvme/common/Kconfig"
source "drivers/nvme/host/Kconfig"
source "drivers/nvme/target/Kconfig"

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_NVME_COMMON) += common/
obj-y += host/
obj-y += target/

View File

@ -0,0 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
config NVME_COMMON
tristate

View File

@ -0,0 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
ccflags-y += -I$(src)
obj-$(CONFIG_NVME_COMMON) += nvme-common.o
nvme-common-y += auth.o

483
drivers/nvme/common/auth.c Normal file
View File

@ -0,0 +1,483 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2020 Hannes Reinecke, SUSE Linux
*/
#include <linux/module.h>
#include <linux/crc32.h>
#include <linux/base64.h>
#include <linux/prandom.h>
#include <linux/scatterlist.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
#include <crypto/dh.h>
#include <linux/nvme.h>
#include <linux/nvme-auth.h>
static u32 nvme_dhchap_seqnum;
static DEFINE_MUTEX(nvme_dhchap_mutex);
/*
 * nvme_auth_get_seqnum - hand out the next DH-HMAC-CHAP sequence number
 *
 * The global counter is seeded from prandom_u32() the first time it is
 * needed and incremented on every later call, all under nvme_dhchap_mutex.
 *
 * Zero is never returned: it is skipped when the counter wraps around and
 * also when the random seed itself happens to be zero.
 *
 * Return: a non-zero sequence number.
 */
u32 nvme_auth_get_seqnum(void)
{
	u32 seqnum;

	mutex_lock(&nvme_dhchap_mutex);
	if (!nvme_dhchap_seqnum)
		nvme_dhchap_seqnum = prandom_u32();
	else
		nvme_dhchap_seqnum++;
	/* Catches both a zero seed and wrap-around of the counter. */
	if (!nvme_dhchap_seqnum)
		nvme_dhchap_seqnum++;
	seqnum = nvme_dhchap_seqnum;
	mutex_unlock(&nvme_dhchap_mutex);

	return seqnum;
}
EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum);
/*
 * Table of known DH groups, indexed by the NVME_AUTH_DHGROUP_* identifier.
 * Consulted by nvme_auth_dhgroup_name(), nvme_auth_dhgroup_kpp() and
 * nvme_auth_dhgroup_id().
 */
static struct nvme_auth_dhgroup_map {
/* Group name: returned by nvme_auth_dhgroup_name(), matched by nvme_auth_dhgroup_id() */
const char name[16];
/* Returned by nvme_auth_dhgroup_kpp(); presumably the crypto API KPP algorithm name - confirm */
const char kpp[16];
} dhgroup_map[] = {
[NVME_AUTH_DHGROUP_NULL] = {
.name = "null", .kpp = "null" },
[NVME_AUTH_DHGROUP_2048] = {
.name = "ffdhe2048", .kpp = "ffdhe2048(dh)" },
[NVME_AUTH_DHGROUP_3072] = {
.name = "ffdhe3072", .kpp = "ffdhe3072(dh)" },
[NVME_AUTH_DHGROUP_4096] = {
.name = "ffdhe4096", .kpp = "ffdhe4096(dh)" },
[NVME_AUTH_DHGROUP_6144] = {
.name = "ffdhe6144", .kpp = "ffdhe6144(dh)" },
[NVME_AUTH_DHGROUP_8192] = {
.name = "ffdhe8192", .kpp = "ffdhe8192(dh)" },
};
/*
 * nvme_auth_dhgroup_name - map a DH group id to its name
 * @dhgroup_id: index into dhgroup_map
 *
 * Return: the table's name string, or NULL if @dhgroup_id lies beyond the
 * end of the table.
 */
const char *nvme_auth_dhgroup_name(u8 dhgroup_id)
{
	return dhgroup_id < ARRAY_SIZE(dhgroup_map) ?
		dhgroup_map[dhgroup_id].name : NULL;
}
EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_name);
/*
 * nvme_auth_dhgroup_kpp - map a DH group id to its kpp string
 * @dhgroup_id: index into dhgroup_map
 *
 * Return: the table's kpp string, or NULL if @dhgroup_id lies beyond the
 * end of the table.
 */
const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id)
{
	return dhgroup_id < ARRAY_SIZE(dhgroup_map) ?
		dhgroup_map[dhgroup_id].kpp : NULL;
}
EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_kpp);
u8 nvme_auth_dhgroup_id(const char *dhgroup_name)
{
int i;
if (!dhgroup_name || !strlen(dhgroup_name))
return NVME_AUTH_DHGROUP_INVALID;
for (i = 0; i < ARRAY_SIZE(dhgroup_map); i++) {
if (!strlen(dhgroup_map[i].name))
continue;
if (!strncmp(dhgroup_map[i].name, dhgroup_name,
strlen(dhgroup_map[i].name)))
return i;
}
return NVME_AUTH_DHGROUP_INVALID;
}
EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id);
/*
 * Table of known hash functions, indexed by the NVME_AUTH_HASH_* identifier.
 * Consulted by nvme_auth_hmac_name(), nvme_auth_digest_name(),
 * nvme_auth_hmac_id() and nvme_auth_hmac_hash_len().
 */
static struct nvme_dhchap_hash_map {
/* Hash length as reported by nvme_auth_hmac_hash_len() */
int len;
/* HMAC algorithm name: returned by nvme_auth_hmac_name(), matched by nvme_auth_hmac_id() */
const char hmac[15];
/* Plain digest algorithm name returned by nvme_auth_digest_name() */
const char digest[8];
} hash_map[] = {
[NVME_AUTH_HASH_SHA256] = {
.len = 32,
.hmac = "hmac(sha256)",
.digest = "sha256",
},
[NVME_AUTH_HASH_SHA384] = {
.len = 48,
.hmac = "hmac(sha384)",
.digest = "sha384",
},
[NVME_AUTH_HASH_SHA512] = {
.len = 64,
.hmac = "hmac(sha512)",
.digest = "sha512",
},
};
/*
 * nvme_auth_hmac_name - map a hash id to its HMAC algorithm name
 * @hmac_id: index into hash_map
 *
 * Return: the table's hmac string, or NULL if @hmac_id lies beyond the end
 * of the table.
 */
const char *nvme_auth_hmac_name(u8 hmac_id)
{
	return hmac_id < ARRAY_SIZE(hash_map) ?
		hash_map[hmac_id].hmac : NULL;
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_name);
/*
 * nvme_auth_digest_name - map a hash id to its plain digest algorithm name
 * @hmac_id: index into hash_map
 *
 * Return: the table's digest string, or NULL if @hmac_id lies beyond the
 * end of the table.
 */
const char *nvme_auth_digest_name(u8 hmac_id)
{
	return hmac_id < ARRAY_SIZE(hash_map) ?
		hash_map[hmac_id].digest : NULL;
}
EXPORT_SYMBOL_GPL(nvme_auth_digest_name);
u8 nvme_auth_hmac_id(const char *hmac_name)
{
int i;
if (!hmac_name || !strlen(hmac_name))
return NVME_AUTH_HASH_INVALID;
for (i = 0; i < ARRAY_SIZE(hash_map); i++) {
if (!strlen(hash_map[i].hmac))
continue;
if (!strncmp(hash_map[i].hmac, hmac_name,
strlen(hash_map[i].hmac)))
return i;
}
return NVME_AUTH_HASH_INVALID;
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_id);
/*
 * nvme_auth_hmac_hash_len - hash length recorded for a hash id
 * @hmac_id: index into hash_map
 *
 * Return: the table's len value, or 0 if @hmac_id lies beyond the end of
 * the table.
 */
size_t nvme_auth_hmac_hash_len(u8 hmac_id)
{
	return hmac_id < ARRAY_SIZE(hash_map) ? hash_map[hmac_id].len : 0;
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len);
/*
 * nvme_auth_extract_key - decode a base64 secret into a DH-HMAC-CHAP key
 * @secret:   base64 text; anything from the last ':' onwards is ignored
 * @key_hash: hash id the key is meant for; when non-zero the decoded key
 *            length must equal nvme_auth_hmac_hash_len(@key_hash)
 *
 * The decoded blob must be 36, 52 or 68 bytes long: the key itself followed
 * by a four byte little-endian CRC-32 of the key, which is verified here.
 *
 * Return: a newly allocated key (release it with nvme_auth_free_key()), or
 * an ERR_PTR() carrying -ENOMEM, the base64 decoder's error, -EINVAL for a
 * bad length, or -EKEYREJECTED for a CRC mismatch.
 */
struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
		u8 key_hash)
{
	struct nvme_dhchap_key *key;
	unsigned char *colon;
	u32 stored_crc, computed_crc;
	size_t b64_len;
	int key_len, ret;

	/* The secret may carry a trailing ':'; stop decoding in front of it. */
	colon = strrchr(secret, ':');
	b64_len = colon ? colon - secret : strlen(secret);

	key = kzalloc(sizeof(*key), GFP_KERNEL);
	if (!key)
		return ERR_PTR(-ENOMEM);

	key->key = kzalloc(b64_len, GFP_KERNEL);
	if (!key->key) {
		ret = -ENOMEM;
		goto out_free_key;
	}

	key_len = base64_decode(secret, b64_len, key->key);
	if (key_len < 0) {
		pr_debug("base64 key decoding error %d\n",
			 key_len);
		ret = key_len;
		goto out_free_secret;
	}

	switch (key_len) {
	case 36:
	case 52:
	case 68:
		break;
	default:
		pr_err("Invalid key len %d\n", key_len);
		ret = -EINVAL;
		goto out_free_secret;
	}

	if (key_hash > 0 &&
	    (key_len - 4) != nvme_auth_hmac_hash_len(key_hash)) {
		pr_err("Mismatched key len %d for %s\n", key_len,
		       nvme_auth_hmac_name(key_hash));
		ret = -EINVAL;
		goto out_free_secret;
	}

	/* The last four bytes are the CRC in little-endian format. */
	key_len -= 4;
	stored_crc = get_unaligned_le32(key->key + key_len);
	/*
	 * crc32() does not apply the initial and final inversion itself,
	 * so seed it with ~0 and invert the result by hand.
	 */
	computed_crc = ~crc32(~0, key->key, key_len);
	if (stored_crc != computed_crc) {
		pr_err("key crc mismatch (key %08x, crc %08x)\n",
		       stored_crc, computed_crc);
		ret = -EKEYREJECTED;
		goto out_free_secret;
	}

	key->len = key_len;
	key->hash = key_hash;
	return key;

out_free_secret:
	kfree_sensitive(key->key);
out_free_key:
	kfree(key);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(nvme_auth_extract_key);
/*
 * nvme_auth_free_key - release a key object
 * @key: key to free; NULL is accepted and ignored
 *
 * The key bytes are released with kfree_sensitive(), the containing
 * structure with kfree().
 */
void nvme_auth_free_key(struct nvme_dhchap_key *key)
{
	if (key) {
		kfree_sensitive(key->key);
		kfree(key);
	}
}
EXPORT_SYMBOL_GPL(nvme_auth_free_key);
/*
 * nvme_auth_transform_key - derive the transformed key for an NQN
 * @key: key to transform; @key->hash selects the HMAC
 * @nqn: NUL-terminated NQN string mixed into the HMAC
 *
 * With @key->hash == 0 the key bytes are simply duplicated. Otherwise the
 * result is HMAC(@key->key) computed over @nqn followed by the literal
 * string "NVMe-over-Fabrics".
 *
 * Return: a newly allocated buffer owned by the caller, or an ERR_PTR():
 * -ENOKEY if no key was supplied, -EINVAL for an unknown hash id, -ENOMEM,
 * or the error reported by the crypto API.
 */
u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
{
	const char *hmac_name;
	struct crypto_shash *key_tfm;
	struct shash_desc *shash;
	u8 *transformed_key;
	int ret;

	if (!key || !key->key) {
		pr_warn("No key specified\n");
		return ERR_PTR(-ENOKEY);
	}
	if (key->hash == 0) {
		transformed_key = kmemdup(key->key, key->len, GFP_KERNEL);
		return transformed_key ? transformed_key : ERR_PTR(-ENOMEM);
	}
	hmac_name = nvme_auth_hmac_name(key->hash);
	if (!hmac_name) {
		pr_warn("Invalid key hash id %d\n", key->hash);
		return ERR_PTR(-EINVAL);
	}

	key_tfm = crypto_alloc_shash(hmac_name, 0, 0);
	if (IS_ERR(key_tfm))
		return ERR_CAST(key_tfm);

	shash = kmalloc(sizeof(struct shash_desc) +
			crypto_shash_descsize(key_tfm),
			GFP_KERNEL);
	if (!shash) {
		ret = -ENOMEM;
		goto out_free_key;
	}

	transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL);
	if (!transformed_key) {
		ret = -ENOMEM;
		goto out_free_shash;
	}

	shash->tfm = key_tfm;
	ret = crypto_shash_setkey(key_tfm, key->key, key->len);
	if (ret < 0)
		goto out_free_transformed_key;
	ret = crypto_shash_init(shash);
	if (ret < 0)
		goto out_free_transformed_key;
	ret = crypto_shash_update(shash, nqn, strlen(nqn));
	if (ret < 0)
		goto out_free_transformed_key;
	ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17);
	if (ret < 0)
		goto out_free_transformed_key;
	ret = crypto_shash_final(shash, transformed_key);
	if (ret < 0)
		goto out_free_transformed_key;

	/*
	 * The descriptor may still hold hash state derived from the key,
	 * so wipe it when freeing, as nvme_auth_augmented_challenge() does.
	 */
	kfree_sensitive(shash);
	crypto_free_shash(key_tfm);

	return transformed_key;

out_free_transformed_key:
	kfree_sensitive(transformed_key);
out_free_shash:
	kfree_sensitive(shash);
out_free_key:
	crypto_free_shash(key_tfm);

	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(nvme_auth_transform_key);
/*
 * nvme_auth_hash_skey - digest a session key with the plain hash function
 * @hmac_id:  hash id; selects the digest via nvme_auth_digest_name()
 * @skey:     session key bytes
 * @skey_len: number of bytes in @skey
 * @hkey:     output buffer for the digest; the caller must size it for the
 *            digest of the selected hash
 *
 * Return: 0 on success, -EINVAL if @hmac_id has no digest name, otherwise
 * the error reported by the crypto API.
 */
static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey)
{
	const char *digest_name;
	struct crypto_shash *tfm;
	int ret;

	digest_name = nvme_auth_digest_name(hmac_id);
	if (!digest_name) {
		pr_debug("%s: failed to get digest for %d\n", __func__,
			 hmac_id);
		return -EINVAL;
	}

	tfm = crypto_alloc_shash(digest_name, 0, 0);
	if (IS_ERR(tfm))
		/* Pass on the real reason instead of always claiming -ENOMEM. */
		return PTR_ERR(tfm);

	ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey);
	if (ret < 0)
		pr_debug("%s: Failed to hash digest len %zu\n", __func__,
			 skey_len);

	crypto_free_shash(tfm);
	return ret;
}
/*
 * nvme_auth_augmented_challenge - compute the augmented challenge
 * @hmac_id:   hash id selecting both the digest and the HMAC
 * @skey:      session key
 * @skey_len:  number of bytes in @skey
 * @challenge: challenge value, @hlen bytes
 * @aug:       output buffer receiving the HMAC result
 * @hlen:      length used for the hashed key and the challenge
 *
 * The session key is first digested with nvme_auth_hash_skey(); the digest
 * then keys an HMAC computed over @challenge, whose result lands in @aug.
 *
 * NOTE(review): the scratch buffer for the hashed key is sized by @hlen while
 * nvme_auth_hash_skey() writes a full digest for @hmac_id - callers
 * presumably always pass the matching hash length; confirm.
 *
 * Return: 0 on success, -ENOMEM, -EINVAL for an unknown hash id, or the
 * error reported by the crypto API.
 */
int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
		u8 *challenge, u8 *aug, size_t hlen)
{
	struct crypto_shash *hmac;
	struct shash_desc *shash;
	const char *hmac_name;
	u8 *hkey;
	int err;

	hkey = kmalloc(hlen, GFP_KERNEL);
	if (!hkey)
		return -ENOMEM;

	err = nvme_auth_hash_skey(hmac_id, skey,
				  skey_len, hkey);
	if (err < 0)
		goto free_hkey;

	hmac_name = nvme_auth_hmac_name(hmac_id);
	if (!hmac_name) {
		pr_warn("%s: invalid hash algorithm %d\n",
			__func__, hmac_id);
		err = -EINVAL;
		goto free_hkey;
	}

	hmac = crypto_alloc_shash(hmac_name, 0, 0);
	if (IS_ERR(hmac)) {
		err = PTR_ERR(hmac);
		goto free_hkey;
	}

	shash = kmalloc(sizeof(*shash) + crypto_shash_descsize(hmac),
			GFP_KERNEL);
	if (!shash) {
		err = -ENOMEM;
		goto free_hmac;
	}
	shash->tfm = hmac;

	/* Each step only runs if every earlier one succeeded. */
	err = crypto_shash_setkey(hmac, hkey, hlen);
	if (!err)
		err = crypto_shash_init(shash);
	if (!err)
		err = crypto_shash_update(shash, challenge, hlen);
	if (!err)
		err = crypto_shash_final(shash, aug);

	kfree_sensitive(shash);
free_hmac:
	crypto_free_shash(hmac);
free_hkey:
	kfree_sensitive(hkey);
	return err;
}
EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge);
/*
 * nvme_auth_gen_privkey - set up the private key on a KPP transform
 * @dh_tfm: KPP transform to configure
 * @dh_gid: DH group id; not used by this function
 *
 * Calls crypto_kpp_set_secret() with a NULL, zero-length secret; presumably
 * this asks the KPP implementation to generate the private key itself -
 * confirm against the crypto API.
 *
 * Return: 0 on success or the error from crypto_kpp_set_secret().
 */
int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid)
{
	int err = crypto_kpp_set_secret(dh_tfm, NULL, 0);

	if (err)
		pr_debug("failed to set private key, error %d\n", err);
	return err;
}
EXPORT_SYMBOL_GPL(nvme_auth_gen_privkey);
/*
 * nvme_auth_gen_pubkey - generate the public key for a KPP transform
 * @dh_tfm:       KPP transform
 * @host_key:     output buffer for the public key
 * @host_key_len: size of @host_key in bytes
 *
 * Submits a generate-public-key request with no input and waits for it to
 * complete.
 *
 * Return: 0 on success, -ENOMEM if the request cannot be allocated, or the
 * error reported by the crypto API.
 */
int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
		u8 *host_key, size_t host_key_len)
{
	struct kpp_request *kpp_req;
	struct crypto_wait done;
	struct scatterlist out_sg;
	int err;

	kpp_req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
	if (!kpp_req)
		return -ENOMEM;

	crypto_init_wait(&done);
	kpp_request_set_input(kpp_req, NULL, 0);
	sg_init_one(&out_sg, host_key, host_key_len);
	kpp_request_set_output(kpp_req, &out_sg, host_key_len);
	kpp_request_set_callback(kpp_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				 crypto_req_done, &done);

	err = crypto_wait_req(crypto_kpp_generate_public_key(kpp_req), &done);

	kpp_request_free(kpp_req);
	return err;
}
EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey);
/*
 * nvme_auth_gen_shared_secret - compute the DH shared secret
 * @dh_tfm:       KPP transform
 * @ctrl_key:     input key handed to the KPP request
 * @ctrl_key_len: size of @ctrl_key in bytes
 * @sess_key:     output buffer for the shared secret
 * @sess_key_len: size of @sess_key in bytes
 *
 * Submits a compute-shared-secret request and waits for it to complete.
 *
 * Return: 0 on success, -ENOMEM if the request cannot be allocated, or the
 * error reported by the crypto API.
 */
int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
		u8 *ctrl_key, size_t ctrl_key_len,
		u8 *sess_key, size_t sess_key_len)
{
	struct kpp_request *kpp_req;
	struct crypto_wait done;
	struct scatterlist in_sg, out_sg;
	int err;

	kpp_req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
	if (!kpp_req)
		return -ENOMEM;

	crypto_init_wait(&done);
	sg_init_one(&in_sg, ctrl_key, ctrl_key_len);
	kpp_request_set_input(kpp_req, &in_sg, ctrl_key_len);
	sg_init_one(&out_sg, sess_key, sess_key_len);
	kpp_request_set_output(kpp_req, &out_sg, sess_key_len);
	kpp_request_set_callback(kpp_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				 crypto_req_done, &done);

	err = crypto_wait_req(crypto_kpp_compute_shared_secret(kpp_req), &done);

	kpp_request_free(kpp_req);
	return err;
}
EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret);
/*
 * nvme_auth_generate_key - parse a "DHHC-1:XX:<base64>:" secret into a key
 * @secret:  NUL-terminated secret string, or NULL for "no key"
 * @ret_key: receives the new key; set to NULL whenever no key is produced
 *
 * The hash id is read from the "XX" field and the text after the ten
 * character "DHHC-1:XX:" prefix is handed to nvme_auth_extract_key().
 *
 * Return: 0 on success (including the NULL-secret case), -EINVAL if the
 * secret is malformed or too short, or the error from
 * nvme_auth_extract_key().
 */
int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
{
	struct nvme_dhchap_key *key;
	u8 key_hash;

	/* Never leave the output untouched, whatever path is taken below. */
	*ret_key = NULL;

	if (!secret)
		return 0;

	if (sscanf(secret, "DHHC-1:%hhd:%*s:", &key_hash) != 1)
		return -EINVAL;

	/*
	 * sscanf() reports success as soon as the hash id has been parsed,
	 * so make sure the string really extends to the end of the
	 * 'DHHC-1:XX:' prefix before skipping over it.
	 */
	if (strlen(secret) < 10)
		return -EINVAL;

	/* Pass in the secret without the 'DHHC-1:XX:' prefix */
	key = nvme_auth_extract_key(secret + 10, key_hash);
	if (IS_ERR(key))
		return PTR_ERR(key);

	*ret_key = key;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
MODULE_LICENSE("GPL v2");

View File

@ -92,6 +92,21 @@ config NVME_TCP
If unsure, say N.
config NVME_AUTH
bool "NVM Express over Fabrics In-Band Authentication"
depends on NVME_CORE
select NVME_COMMON
select CRYPTO
select CRYPTO_HMAC
select CRYPTO_SHA256
select CRYPTO_SHA512
select CRYPTO_DH
select CRYPTO_DH_RFC7919_GROUPS
help
This provides support for NVMe over Fabrics In-Band Authentication.
If unsure, say N.
config NVME_APPLE
tristate "Apple ANS2 NVM Express host driver"
depends on OF && BLOCK

View File

@ -10,12 +10,14 @@ obj-$(CONFIG_NVME_FC) += nvme-fc.o
obj-$(CONFIG_NVME_TCP) += nvme-tcp.o
obj-$(CONFIG_NVME_APPLE) += nvme-apple.o
nvme-core-y := core.o ioctl.o constants.o
nvme-core-y += core.o ioctl.o
nvme-core-$(CONFIG_NVME_VERBOSE_ERRORS) += constants.o
nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
nvme-core-$(CONFIG_NVME_AUTH) += auth.o
nvme-y += pci.o

View File

@ -845,11 +845,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
apple_nvme_handle_cq(&anv->adminq, true);
spin_unlock_irqrestore(&anv->lock, flags);
blk_mq_tagset_busy_iter(&anv->tagset, nvme_cancel_request, &anv->ctrl);
blk_mq_tagset_busy_iter(&anv->admin_tagset, nvme_cancel_request,
&anv->ctrl);
blk_mq_tagset_wait_completed_request(&anv->tagset);
blk_mq_tagset_wait_completed_request(&anv->admin_tagset);
nvme_cancel_tagset(&anv->ctrl);
nvme_cancel_admin_tagset(&anv->ctrl);
/*
* The driver will not be starting up queues again if shutting down so
@ -1222,6 +1219,11 @@ static void apple_nvme_async_probe(void *data, async_cookie_t cookie)
nvme_put_ctrl(&anv->ctrl);
}
/*
 * devm action callback: frees the blk-mq tag set passed as @data.
 * Registered through devm_add_action_or_reset(), which is handed this
 * void * wrapper rather than a cast of blk_mq_free_tag_set() itself.
 */
static void devm_apple_nvme_put_tag_set(void *data)
{
blk_mq_free_tag_set(data);
}
static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
{
int ret;
@ -1238,8 +1240,7 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
ret = blk_mq_alloc_tag_set(&anv->admin_tagset);
if (ret)
return ret;
ret = devm_add_action_or_reset(anv->dev,
(void (*)(void *))blk_mq_free_tag_set,
ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
&anv->admin_tagset);
if (ret)
return ret;
@ -1263,8 +1264,8 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv)
ret = blk_mq_alloc_tag_set(&anv->tagset);
if (ret)
return ret;
ret = devm_add_action_or_reset(
anv->dev, (void (*)(void *))blk_mq_free_tag_set, &anv->tagset);
ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
&anv->tagset);
if (ret)
return ret;
@ -1365,6 +1366,11 @@ static int apple_nvme_attach_genpd(struct apple_nvme *anv)
return 0;
}
/*
 * devm action callback: destroys the mempool passed as @data.
 * Registered through devm_add_action_or_reset(), which is handed this
 * void * wrapper rather than a cast of mempool_destroy() itself.
 */
static void devm_apple_nvme_mempool_destroy(void *data)
{
mempool_destroy(data);
}
static int apple_nvme_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
@ -1462,8 +1468,8 @@ static int apple_nvme_probe(struct platform_device *pdev)
ret = -ENOMEM;
goto put_dev;
}
ret = devm_add_action_or_reset(
anv->dev, (void (*)(void *))mempool_destroy, anv->iod_mempool);
ret = devm_add_action_or_reset(anv->dev,
devm_apple_nvme_mempool_destroy, anv->iod_mempool);
if (ret)
goto put_dev;

1017
drivers/nvme/host/auth.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,6 @@
#include "nvme.h"
#ifdef CONFIG_NVME_VERBOSE_ERRORS
static const char * const nvme_ops[] = {
[nvme_cmd_flush] = "Flush",
[nvme_cmd_write] = "Write",
@ -178,6 +177,7 @@ const unsigned char *nvme_get_opcode_str(u8 opcode)
return nvme_ops[opcode];
return "Unknown";
}
EXPORT_SYMBOL_GPL(nvme_get_opcode_str);
const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
{
@ -185,4 +185,3 @@ const unsigned char *nvme_get_admin_opcode_str(u8 opcode)
return nvme_admin_ops[opcode];
return "Unknown";
}
#endif /* CONFIG_NVME_VERBOSE_ERRORS */

View File

@ -24,12 +24,22 @@
#include "nvme.h"
#include "fabrics.h"
#include <linux/nvme-auth.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
#define NVME_MINORS (1U << MINORBITS)
/*
 * Namespace properties gathered from the Identify commands and passed
 * around while a namespace is being scanned.
 */
struct nvme_ns_info {
/* namespace identifiers (eui64, nguid, csi, ...) */
struct nvme_ns_ids ids;
/* namespace ID the Identify commands are issued for */
u32 nsid;
/* copied unchanged from the anagrpid field of the Identify data */
__le32 anagrpid;
/* NVME_NS_NMIC_SHARED is set in the nmic field */
bool is_shared;
/* NVME_NS_ATTR_RO is set in the nsattr field */
bool is_readonly;
/*
 * Always true after a regular Identify Namespace; taken from the
 * NVME_NSTAT_NRDY bit of nstat for the CS-independent Identify.
 */
bool is_ready;
};
unsigned int admin_timeout = 60;
module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
@ -330,6 +340,7 @@ enum nvme_disposition {
COMPLETE,
RETRY,
FAILOVER,
AUTHENTICATE,
};
static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
@ -337,6 +348,9 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
if (likely(nvme_req(req)->status == 0))
return COMPLETE;
if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
return AUTHENTICATE;
if (blk_noretry_request(req) ||
(nvme_req(req)->status & NVME_SC_DNR) ||
nvme_req(req)->retries >= nvme_max_retries)
@ -375,11 +389,13 @@ static inline void nvme_end_req(struct request *req)
void nvme_complete_rq(struct request *req)
{
struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
trace_nvme_complete_rq(req);
nvme_cleanup_cmd(req);
if (nvme_req(req)->ctrl->kas)
nvme_req(req)->ctrl->comp_seen = true;
if (ctrl->kas)
ctrl->comp_seen = true;
switch (nvme_decide_disposition(req)) {
case COMPLETE:
@ -391,6 +407,14 @@ void nvme_complete_rq(struct request *req)
case FAILOVER:
nvme_failover_req(req);
return;
case AUTHENTICATE:
#ifdef CONFIG_NVME_AUTH
queue_work(nvme_wq, &ctrl->dhchap_auth_work);
nvme_retry_req(req);
#else
nvme_end_req(req);
#endif
return;
}
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
@ -702,7 +726,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
switch (ctrl->state) {
case NVME_CTRL_CONNECTING:
if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
(req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
return true;
break;
default:
@ -990,8 +1016,7 @@ static int nvme_execute_rq(struct request *rq, bool at_head)
*/
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
unsigned timeout, int qid, int at_head,
blk_mq_req_flags_t flags)
int qid, int at_head, blk_mq_req_flags_t flags)
{
struct request *req;
int ret;
@ -1000,15 +1025,12 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
else
req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
qid ? qid - 1 : 0);
qid - 1);
if (IS_ERR(req))
return PTR_ERR(req);
nvme_init_request(req, cmd);
if (timeout)
req->timeout = timeout;
if (buffer && bufflen) {
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
if (ret)
@ -1028,7 +1050,7 @@ EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buffer, unsigned bufflen)
{
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
NVME_QID_ANY, 0, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
@ -1329,8 +1351,8 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
}
}
static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_ns_ids *ids)
static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
struct nvme_ns_info *info)
{
struct nvme_command c = { };
bool csi_seen = false;
@ -1343,7 +1365,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
return 0;
c.identify.opcode = nvme_admin_identify;
c.identify.nsid = cpu_to_le32(nsid);
c.identify.nsid = cpu_to_le32(info->nsid);
c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
@ -1355,7 +1377,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (status) {
dev_warn(ctrl->device,
"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
nsid, status);
info->nsid, status);
goto free_data;
}
@ -1365,7 +1387,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (cur->nidl == 0)
break;
len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
if (len < 0)
break;
@ -1374,7 +1396,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (nvme_multi_css(ctrl) && !csi_seen) {
dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
nsid);
info->nsid);
status = -EINVAL;
}
@ -1384,7 +1406,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
}
static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_ns_ids *ids, struct nvme_id_ns **id)
struct nvme_id_ns **id)
{
struct nvme_command c = { };
int error;
@ -1407,20 +1429,6 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
error = NVME_SC_INVALID_NS | NVME_SC_DNR;
if ((*id)->ncap == 0) /* namespace not allocated or attached */
goto out_free_id;
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
dev_info(ctrl->device,
"Ignoring bogus Namespace Identifiers\n");
} else {
if (ctrl->vs >= NVME_VS(1, 1, 0) &&
!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0) &&
!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
}
return 0;
out_free_id:
@ -1428,30 +1436,59 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
return error;
}
static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_id_ns_cs_indep **id)
/*
 * Fill @info from a regular Identify Namespace command for @info->nsid.
 *
 * Copies the ANA group id and derives the shared / read-only flags from the
 * Identify data; the namespace is always marked ready. Unless the controller
 * has NVME_QUIRK_BOGUS_NID, EUI-64 (controller version >= 1.1) and NGUID
 * (version >= 1.2) are taken from the Identify data, but only where
 * @info->ids does not already hold a non-zero value.
 *
 * Returns 0 on success or the error from nvme_identify_ns().
 */
static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
		struct nvme_ns_info *info)
{
	struct nvme_ns_ids *ids = &info->ids;
	struct nvme_id_ns *id;
	int ret = nvme_identify_ns(ctrl, info->nsid, &id);

	if (ret)
		return ret;

	info->anagrpid = id->anagrpid;
	info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
	info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
	info->is_ready = true;

	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
		dev_info(ctrl->device,
			 "Ignoring bogus Namespace Identifiers\n");
		goto out_free_id;
	}

	/* Only fill in identifiers that are still all-zero. */
	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
	    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
		memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
	if (ctrl->vs >= NVME_VS(1, 2, 0) &&
	    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
		memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));

out_free_id:
	kfree(id);
	return 0;
}
static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
struct nvme_ns_info *info)
{
struct nvme_id_ns_cs_indep *id;
struct nvme_command c = {
.identify.opcode = nvme_admin_identify,
.identify.nsid = cpu_to_le32(nsid),
.identify.nsid = cpu_to_le32(info->nsid),
.identify.cns = NVME_ID_CNS_NS_CS_INDEP,
};
int ret;
*id = kmalloc(sizeof(**id), GFP_KERNEL);
if (!*id)
id = kmalloc(sizeof(*id), GFP_KERNEL);
if (!id)
return -ENOMEM;
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
if (ret) {
dev_warn(ctrl->device,
"Identify namespace (CS independent) failed (%d)\n",
ret);
kfree(*id);
return ret;
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
if (!ret) {
info->anagrpid = id->anagrpid;
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
}
return 0;
kfree(id);
return ret;
}
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
@ -1466,7 +1503,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
c.features.dword11 = cpu_to_le32(dword11);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
buffer, buflen, 0, NVME_QID_ANY, 0, 0);
buffer, buflen, NVME_QID_ANY, 0, 0);
if (ret >= 0 && result)
*result = le32_to_cpu(res.u32);
return ret;
@ -1875,6 +1912,11 @@ static void nvme_update_disk_info(struct gendisk *disk,
ns->ctrl->max_zeroes_sectors);
}
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
{
return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
}
static inline bool nvme_first_scan(struct gendisk *disk)
{
/* nvme_alloc_ns() scans the disk prior to adding it */
@ -1912,12 +1954,44 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
blk_queue_chunk_sectors(ns->queue, iob);
}
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
static int nvme_update_ns_info_generic(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
unsigned lbaf = nvme_lbaf_index(id->flbas);
blk_mq_freeze_queue(ns->disk->queue);
nvme_set_queue_limits(ns->ctrl, ns->queue);
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
blk_mq_unfreeze_queue(ns->disk->queue);
if (nvme_ns_head_multipath(ns->head)) {
blk_mq_freeze_queue(ns->head->disk->queue);
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
blk_stack_limits(&ns->head->disk->queue->limits,
&ns->queue->limits, 0);
ns->head->disk->flags |= GENHD_FL_HIDDEN;
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
/* Hide the block-interface for these devices */
ns->disk->flags |= GENHD_FL_HIDDEN;
set_bit(NVME_NS_READY, &ns->flags);
return 0;
}
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
struct nvme_id_ns *id;
unsigned lbaf;
int ret;
ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
if (ret)
return ret;
blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas);
ns->lba_shift = id->lbaf[lbaf].ds;
nvme_set_queue_limits(ns->ctrl, ns->queue);
@ -1927,36 +2001,35 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
if (ns->head->ids.csi == NVME_CSI_ZNS) {
ret = nvme_update_zone_info(ns, lbaf);
if (ret)
goto out_unfreeze;
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue);
goto out;
}
}
set_disk_ro(ns->disk, (id->nsattr & NVME_NS_ATTR_RO) ||
test_bit(NVME_NS_FORCE_RO, &ns->flags));
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
ret = nvme_revalidate_zones(ns);
if (ret && !nvme_first_scan(ns->disk))
return ret;
goto out;
}
if (nvme_ns_head_multipath(ns->head)) {
blk_mq_freeze_queue(ns->head->disk->queue);
nvme_update_disk_info(ns->head->disk, ns, id);
set_disk_ro(ns->head->disk,
(id->nsattr & NVME_NS_ATTR_RO) ||
test_bit(NVME_NS_FORCE_RO, &ns->flags));
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
blk_stack_limits(&ns->head->disk->queue->limits,
&ns->queue->limits, 0);
disk_update_readahead(ns->head->disk);
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
return 0;
out_unfreeze:
ret = 0;
out:
/*
* If probing fails due to an unsupported feature, hide the block device,
* but still allow other access.
@ -1966,10 +2039,31 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
set_bit(NVME_NS_READY, &ns->flags);
ret = 0;
}
blk_mq_unfreeze_queue(ns->disk->queue);
kfree(id);
return ret;
}
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
switch (info->ids.csi) {
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device,
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
return nvme_update_ns_info_generic(ns, info);
}
return nvme_update_ns_info_block(ns, info);
case NVME_CSI_NVM:
return nvme_update_ns_info_block(ns, info);
default:
dev_info(ns->ctrl->device,
"block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
return nvme_update_ns_info_generic(ns, info);
}
}
static char nvme_pr_type(enum pr_type type)
{
switch (type) {
@ -2103,7 +2197,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
cmd.common.cdw11 = cpu_to_le32(len);
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0,
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
NVME_QID_ANY, 1, 0);
}
EXPORT_SYMBOL_GPL(nvme_sec_submit);
@ -2123,6 +2217,7 @@ static int nvme_report_zones(struct gendisk *disk, sector_t sector,
static const struct block_device_operations nvme_bdev_ops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
.compat_ioctl = blkdev_compat_ptr_ioctl,
.open = nvme_open,
.release = nvme_release,
.getgeo = nvme_getgeo,
@ -3613,6 +3708,108 @@ static ssize_t dctype_show(struct device *dev,
}
static DEVICE_ATTR_RO(dctype);
#ifdef CONFIG_NVME_AUTH
/*
 * sysfs show handler for the "dhchap_secret" attribute.
 *
 * Emits the secret string stored in ctrl->opts->dhchap_secret followed by
 * a newline, or "none" when no secret is set. Returns the sysfs_emit()
 * result.
 */
static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	const char *secret = ctrl->opts->dhchap_secret;

	if (secret)
		return sysfs_emit(buf, "%s\n", secret);

	return sysfs_emit(buf, "none\n");
}
/*
 * sysfs store handler for the "dhchap_secret" attribute.
 *
 * Accepts a new host secret, which must start with "DHHC-1:". Writing is
 * rejected with -EINVAL when the controller was not set up with a secret
 * or when the input is malformed, and with -ENOMEM when the private copy
 * of the input cannot be allocated. If the secret differs from the
 * current one, a new host key is generated and the stored secret is
 * replaced; a key generation failure is returned to the caller. In every
 * successful case re-authentication is queued and @count is returned.
 */
static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	struct nvmf_ctrl_options *opts = ctrl->opts;
	char *dhchap_secret;

	if (!opts->dhchap_secret)
		return -EINVAL;
	if (count < 7)
		return -EINVAL;
	if (memcmp(buf, "DHHC-1:", 7))
		return -EINVAL;

	/* NUL-terminated private copy; kzalloc() provides the terminator. */
	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
	if (!dhchap_secret)
		return -ENOMEM;
	memcpy(dhchap_secret, buf, count);

	nvme_auth_stop(ctrl);
	if (strcmp(dhchap_secret, opts->dhchap_secret)) {
		int ret;

		/*
		 * NOTE(review): whether nvme_auth_generate_key() releases the
		 * key previously stored in ctrl->host_key is not visible from
		 * here - confirm against its implementation.
		 */
		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key);
		if (ret) {
			/* The copy was never handed over to opts: free it. */
			kfree(dhchap_secret);
			return ret;
		}
		kfree(opts->dhchap_secret);
		opts->dhchap_secret = dhchap_secret;
		/* Key has changed; re-authentication with new key */
		nvme_auth_reset(ctrl);
	} else {
		/* Secret unchanged: the temporary copy is not needed. */
		kfree(dhchap_secret);
	}

	/* Start re-authentication */
	dev_info(ctrl->device, "re-authenticating controller\n");
	queue_work(nvme_wq, &ctrl->dhchap_auth_work);

	return count;
}
static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR,
nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store);
/*
 * sysfs show handler for the "dhchap_ctrl_secret" attribute.
 *
 * Emits the string stored in ctrl->opts->dhchap_ctrl_secret followed by a
 * newline, or "none" when it is not set. Returns the sysfs_emit() result.
 */
static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	const char *secret = ctrl->opts->dhchap_ctrl_secret;

	if (secret)
		return sysfs_emit(buf, "%s\n", secret);

	return sysfs_emit(buf, "none\n");
}
/*
 * sysfs store handler for the "dhchap_ctrl_secret" attribute.
 *
 * Accepts a new controller secret, which must start with "DHHC-1:".
 * Writing is rejected with -EINVAL when the controller was not set up
 * with a controller secret or when the input is malformed, and with
 * -ENOMEM when the private copy of the input cannot be allocated. If the
 * secret differs from the current one, a new controller key is generated
 * and the stored secret is replaced; a key generation failure is returned
 * to the caller. In every successful case re-authentication is queued and
 * @count is returned.
 */
static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	struct nvmf_ctrl_options *opts = ctrl->opts;
	char *dhchap_secret;

	if (!opts->dhchap_ctrl_secret)
		return -EINVAL;
	if (count < 7)
		return -EINVAL;
	if (memcmp(buf, "DHHC-1:", 7))
		return -EINVAL;

	/* NUL-terminated private copy; kzalloc() provides the terminator. */
	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
	if (!dhchap_secret)
		return -ENOMEM;
	memcpy(dhchap_secret, buf, count);

	nvme_auth_stop(ctrl);
	if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
		int ret;

		/*
		 * NOTE(review): whether nvme_auth_generate_key() releases the
		 * key previously stored in ctrl->ctrl_key is not visible from
		 * here - confirm against its implementation.
		 */
		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key);
		if (ret) {
			/* The copy was never handed over to opts: free it. */
			kfree(dhchap_secret);
			return ret;
		}
		kfree(opts->dhchap_ctrl_secret);
		opts->dhchap_ctrl_secret = dhchap_secret;
		/* Key has changed; re-authentication with new key */
		nvme_auth_reset(ctrl);
	} else {
		/* Secret unchanged: the temporary copy is not needed. */
		kfree(dhchap_secret);
	}

	/* Start re-authentication */
	dev_info(ctrl->device, "re-authenticating controller\n");
	queue_work(nvme_wq, &ctrl->dhchap_auth_work);

	return count;
}
static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
#endif
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@ -3636,6 +3833,10 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_kato.attr,
&dev_attr_cntrltype.attr,
&dev_attr_dctype.attr,
#ifdef CONFIG_NVME_AUTH
&dev_attr_dhchap_secret.attr,
&dev_attr_dhchap_ctrl_secret.attr,
#endif
NULL
};
@ -3659,6 +3860,12 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return 0;
if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
return 0;
#ifdef CONFIG_NVME_AUTH
if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
return 0;
if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
return 0;
#endif
return a->mode;
}
@ -3786,7 +3993,7 @@ static int nvme_add_ns_cdev(struct nvme_ns *ns)
}
static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
unsigned nsid, struct nvme_ns_ids *ids, bool is_shared)
struct nvme_ns_info *info)
{
struct nvme_ns_head *head;
size_t size = sizeof(*head);
@ -3808,9 +4015,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
if (ret)
goto out_ida_remove;
head->subsys = ctrl->subsys;
head->ns_id = nsid;
head->ids = *ids;
head->shared = is_shared;
head->ns_id = info->nsid;
head->ids = info->ids;
head->shared = info->is_shared;
kref_init(&head->ref);
if (head->ids.csi) {
@ -3867,54 +4074,54 @@ static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
return ret;
}
static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
struct nvme_ns_ids *ids, bool is_shared)
static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
{
struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_ns_head *head = NULL;
int ret;
ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids);
ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
if (ret) {
dev_err(ctrl->device,
"globally duplicate IDs for nsid %d\n", nsid);
"globally duplicate IDs for nsid %d\n", info->nsid);
nvme_print_device_info(ctrl);
return ret;
}
mutex_lock(&ctrl->subsys->lock);
head = nvme_find_ns_head(ctrl, nsid);
head = nvme_find_ns_head(ctrl, info->nsid);
if (!head) {
ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids);
ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
if (ret) {
dev_err(ctrl->device,
"duplicate IDs in subsystem for nsid %d\n",
nsid);
info->nsid);
goto out_unlock;
}
head = nvme_alloc_ns_head(ctrl, nsid, ids, is_shared);
head = nvme_alloc_ns_head(ctrl, info);
if (IS_ERR(head)) {
ret = PTR_ERR(head);
goto out_unlock;
}
} else {
ret = -EINVAL;
if (!is_shared || !head->shared) {
if (!info->is_shared || !head->shared) {
dev_err(ctrl->device,
"Duplicate unshared namespace %d\n", nsid);
"Duplicate unshared namespace %d\n",
info->nsid);
goto out_put_ns_head;
}
if (!nvme_ns_ids_equal(&head->ids, ids)) {
if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
dev_err(ctrl->device,
"IDs don't match for shared namespace %d\n",
nsid);
info->nsid);
goto out_put_ns_head;
}
if (!multipath && !list_empty(&head->list)) {
dev_warn(ctrl->device,
"Found shared namespace %d, but multipathing not supported.\n",
nsid);
info->nsid);
dev_warn_once(ctrl->device,
"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
}
@ -3968,20 +4175,15 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
list_add(&ns->list, &ns->ctrl->namespaces);
}
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_ns_ids *ids)
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
{
struct nvme_ns *ns;
struct gendisk *disk;
struct nvme_id_ns *id;
int node = ctrl->numa_node;
if (nvme_identify_ns(ctrl, nsid, ids, &id))
return;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
goto out_free_id;
return;
disk = blk_mq_alloc_disk(ctrl->tagset, ns);
if (IS_ERR(disk))
@ -4002,7 +4204,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
ns->ctrl = ctrl;
kref_init(&ns->kref);
if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED))
if (nvme_init_ns_head(ns, info))
goto out_cleanup_disk;
/*
@ -4028,7 +4230,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
ns->head->instance);
}
if (nvme_update_ns_info(ns, id))
if (nvme_update_ns_info(ns, info))
goto out_unlink_ns;
down_write(&ctrl->namespaces_rwsem);
@ -4042,9 +4244,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (!nvme_ns_head_multipath(ns->head))
nvme_add_ns_cdev(ns);
nvme_mpath_add_disk(ns, id);
nvme_mpath_add_disk(ns, info->anagrpid);
nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
kfree(id);
return;
@ -4064,8 +4265,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
put_disk(disk);
out_free_ns:
kfree(ns);
out_free_id:
kfree(id);
}
static void nvme_ns_remove(struct nvme_ns *ns)
@ -4123,29 +4322,21 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
}
}
static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
struct nvme_id_ns *id;
int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
if (test_bit(NVME_NS_DEAD, &ns->flags))
goto out;
ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
if (ret)
goto out;
ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
dev_err(ns->ctrl->device,
"identifiers changed for nsid %d\n", ns->head->ns_id);
goto out_free_id;
goto out;
}
ret = nvme_update_ns_info(ns, id);
out_free_id:
kfree(id);
ret = nvme_update_ns_info(ns, info);
out:
/*
* Only remove the namespace if we got a fatal error back from the
@ -4157,59 +4348,47 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
nvme_ns_remove(ns);
}
static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns_ids ids = { };
struct nvme_id_ns_cs_indep *id;
struct nvme_ns_info info = { .nsid = nsid };
struct nvme_ns *ns;
bool ready = true;
if (nvme_identify_ns_descs(ctrl, nsid, &ids))
if (nvme_identify_ns_descs(ctrl, &info))
return;
/*
* Check if the namespace is ready. If not ignore it, we will get an
* AEN once it becomes ready and restart the scan.
*/
if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) &&
!nvme_identify_ns_cs_indep(ctrl, nsid, &id)) {
ready = id->nstat & NVME_NSTAT_NRDY;
kfree(id);
if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
dev_warn(ctrl->device,
"command set not reported for nsid: %d\n", nsid);
return;
}
if (!ready)
/*
* If available try to use the Command Set Independent Identify Namespace
* data structure to find all the generic information that is needed to
* set up a namespace. If not fall back to the legacy version.
*/
if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
(info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) {
if (nvme_ns_info_from_id_cs_indep(ctrl, &info))
return;
} else {
if (nvme_ns_info_from_identify(ctrl, &info))
return;
}
/*
* Ignore the namespace if it is not ready. We will get an AEN once it
* becomes ready and restart the scan.
*/
if (!info.is_ready)
return;
ns = nvme_find_get_ns(ctrl, nsid);
if (ns) {
nvme_validate_ns(ns, &ids);
nvme_validate_ns(ns, &info);
nvme_put_ns(ns);
return;
}
switch (ids.csi) {
case NVME_CSI_NVM:
nvme_alloc_ns(ctrl, nsid, &ids);
break;
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_warn(ctrl->device,
"nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
nsid);
break;
}
if (!nvme_multi_css(ctrl)) {
dev_warn(ctrl->device,
"command set not reported for nsid: %d\n",
nsid);
break;
}
nvme_alloc_ns(ctrl, nsid, &ids);
break;
default:
dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
ids.csi, nsid);
break;
} else {
nvme_alloc_ns(ctrl, &info);
}
}
@ -4265,7 +4444,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
if (!nsid) /* end of the list? */
goto out;
nvme_validate_or_alloc_ns(ctrl, nsid);
nvme_scan_ns(ctrl, nsid);
while (++prev < nsid)
nvme_ns_remove_by_nsid(ctrl, prev);
}
@ -4288,7 +4467,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
kfree(id);
for (i = 1; i <= nn; i++)
nvme_validate_or_alloc_ns(ctrl, i);
nvme_scan_ns(ctrl, i);
nvme_remove_invalid_namespaces(ctrl, nn);
}
@ -4525,9 +4704,19 @@ static void nvme_fw_act_work(struct work_struct *work)
nvme_get_fw_slot_info(ctrl);
}
/* Extract the AER type field, i.e. bits 2:0 of the completion result. */
static u32 nvme_aer_type(u32 result)
{
	const u32 type_mask = 0x7;

	return result & type_mask;
}
/* Extract the AER subtype field, i.e. bits 15:8 of the completion result. */
static u32 nvme_aer_subtype(u32 result)
{
	const u32 subtype = result >> 8;

	return subtype & 0xff;
}
static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
{
u32 aer_notice_type = (result & 0xff00) >> 8;
u32 aer_notice_type = nvme_aer_subtype(result);
trace_nvme_async_event(ctrl, aer_notice_type);
@ -4542,8 +4731,10 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
* recovery actions from interfering with the controller's
* firmware activation.
*/
if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
nvme_auth_stop(ctrl);
queue_work(nvme_wq, &ctrl->fw_act_work);
}
break;
#ifdef CONFIG_NVME_MULTIPATH
case NVME_AER_NOTICE_ANA:
@ -4560,11 +4751,19 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
}
}
/*
 * React to a persistent internal error AER: record the event through the
 * nvme_async_event tracepoint, warn on the controller device, and then
 * request a controller reset.
 */
static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
{
/* Trace and log first, then hand over to the reset. */
trace_nvme_async_event(ctrl, NVME_AER_ERROR);
dev_warn(ctrl->device, "resetting controller due to AER\n");
/* The return value of nvme_reset_ctrl() is not checked here. */
nvme_reset_ctrl(ctrl);
}
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
volatile union nvme_result *res)
{
u32 result = le32_to_cpu(res->u32);
u32 aer_type = result & 0x07;
u32 aer_type = nvme_aer_type(result);
u32 aer_subtype = nvme_aer_subtype(result);
if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
return;
@ -4574,6 +4773,15 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
nvme_handle_aen_notice(ctrl, result);
break;
case NVME_AER_ERROR:
/*
* For a persistent internal error, don't run async_event_work
* to submit a new AER. The controller reset will do it.
*/
if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
nvme_handle_aer_persistent_error(ctrl);
return;
}
fallthrough;
case NVME_AER_SMART:
case NVME_AER_CSS:
case NVME_AER_VS:
@ -4590,6 +4798,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
nvme_mpath_stop(ctrl);
nvme_auth_stop(ctrl);
nvme_stop_keep_alive(ctrl);
nvme_stop_failfast_work(ctrl);
flush_work(&ctrl->async_event_work);
@ -4649,6 +4858,8 @@ static void nvme_free_ctrl(struct device *dev)
nvme_free_cels(ctrl);
nvme_mpath_uninit(ctrl);
nvme_auth_stop(ctrl);
nvme_auth_free(ctrl);
__free_page(ctrl->discard_page);
if (subsys) {
@ -4739,6 +4950,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
nvme_mpath_init_ctrl(ctrl);
nvme_auth_init_ctrl(ctrl);
return 0;
out_free_name:

View File

@ -152,7 +152,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
cmd.prop_get.fctype = nvme_fabrics_type_property_get;
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
NVME_QID_ANY, 0, 0);
if (ret >= 0)
@ -198,7 +198,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
cmd.prop_get.attrib = 1;
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0,
NVME_QID_ANY, 0, 0);
if (ret >= 0)
@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
cmd.prop_set.offset = cpu_to_le32(off);
cmd.prop_set.value = cpu_to_le64(val);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0,
NVME_QID_ANY, 0, 0);
if (unlikely(ret))
dev_err(ctrl->device,
@ -331,6 +331,10 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl,
dev_err(ctrl->device,
"Connect command failed: host path error\n");
break;
case NVME_SC_AUTH_REQUIRED:
dev_err(ctrl->device,
"Connect command failed: authentication required\n");
break;
default:
dev_err(ctrl->device,
"Connect command failed, error wo/DNR bit: %d\n",
@ -365,6 +369,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
union nvme_result res;
struct nvmf_connect_data *data;
int ret;
u32 result;
cmd.connect.opcode = nvme_fabrics_command;
cmd.connect.fctype = nvme_fabrics_type_connect;
@ -389,7 +394,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
data, sizeof(*data), 0, NVME_QID_ANY, 1,
data, sizeof(*data), NVME_QID_ANY, 1,
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
@ -397,8 +402,25 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
goto out_free_data;
}
ctrl->cntlid = le16_to_cpu(res.u16);
result = le32_to_cpu(res.u32);
ctrl->cntlid = result & 0xFFFF;
if ((result >> 16) & 0x3) {
/* Authentication required */
ret = nvme_auth_negotiate(ctrl, 0);
if (ret) {
dev_warn(ctrl->device,
"qid 0: authentication setup failed\n");
ret = NVME_SC_AUTH_REQUIRED;
goto out_free_data;
}
ret = nvme_auth_wait(ctrl, 0);
if (ret)
dev_warn(ctrl->device,
"qid 0: authentication failed\n");
else
dev_info(ctrl->device,
"qid 0: authenticated\n");
}
out_free_data:
kfree(data);
return ret;
@ -431,6 +453,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
struct nvmf_connect_data *data;
union nvme_result res;
int ret;
u32 result;
cmd.connect.opcode = nvme_fabrics_command;
cmd.connect.fctype = nvme_fabrics_type_connect;
@ -450,12 +473,27 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
data, sizeof(*data), 0, qid, 1,
data, sizeof(*data), qid, 1,
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
&cmd, data);
}
result = le32_to_cpu(res.u32);
if ((result >> 16) & 2) {
/* Authentication required */
ret = nvme_auth_negotiate(ctrl, qid);
if (ret) {
dev_warn(ctrl->device,
"qid %d: authentication setup failed\n", qid);
ret = NVME_SC_AUTH_REQUIRED;
} else {
ret = nvme_auth_wait(ctrl, qid);
if (ret)
dev_warn(ctrl->device,
"qid %u: authentication failed\n", qid);
}
}
kfree(data);
return ret;
}
@ -548,6 +586,8 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_TOS, "tos=%d" },
{ NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" },
{ NVMF_OPT_DISCOVERY, "discovery" },
{ NVMF_OPT_DHCHAP_SECRET, "dhchap_secret=%s" },
{ NVMF_OPT_DHCHAP_CTRL_SECRET, "dhchap_ctrl_secret=%s" },
{ NVMF_OPT_ERR, NULL }
};
@ -829,6 +869,34 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
case NVMF_OPT_DISCOVERY:
opts->discovery_nqn = true;
break;
case NVMF_OPT_DHCHAP_SECRET:
p = match_strdup(args);
if (!p) {
ret = -ENOMEM;
goto out;
}
if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
pr_err("Invalid DH-CHAP secret %s\n", p);
ret = -EINVAL;
goto out;
}
kfree(opts->dhchap_secret);
opts->dhchap_secret = p;
break;
case NVMF_OPT_DHCHAP_CTRL_SECRET:
p = match_strdup(args);
if (!p) {
ret = -ENOMEM;
goto out;
}
if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
pr_err("Invalid DH-CHAP secret %s\n", p);
ret = -EINVAL;
goto out;
}
kfree(opts->dhchap_ctrl_secret);
opts->dhchap_ctrl_secret = p;
break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@ -947,6 +1015,8 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
kfree(opts->subsysnqn);
kfree(opts->host_traddr);
kfree(opts->host_iface);
kfree(opts->dhchap_secret);
kfree(opts->dhchap_ctrl_secret);
kfree(opts);
}
EXPORT_SYMBOL_GPL(nvmf_free_options);
@ -956,7 +1026,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
NVMF_OPT_FAIL_FAST_TMO)
NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_DHCHAP_SECRET |\
NVMF_OPT_DHCHAP_CTRL_SECRET)
static struct nvme_ctrl *
nvmf_create_ctrl(struct device *dev, const char *buf)
@ -1192,7 +1263,14 @@ static void __exit nvmf_exit(void)
BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_auth_send_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_auth_receive_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024);
BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_negotiate_data) != 8);
BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_challenge_data) != 16);
BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_reply_data) != 16);
BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success1_data) != 16);
BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success2_data) != 16);
}
MODULE_LICENSE("GPL v2");

View File

@ -68,6 +68,8 @@ enum {
NVMF_OPT_FAIL_FAST_TMO = 1 << 20,
NVMF_OPT_HOST_IFACE = 1 << 21,
NVMF_OPT_DISCOVERY = 1 << 22,
NVMF_OPT_DHCHAP_SECRET = 1 << 23,
NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24,
};
/**
@ -97,6 +99,9 @@ enum {
* @max_reconnects: maximum number of allowed reconnect attempts before removing
* the controller, (-1) means reconnect forever, zero means remove
* immediately;
* @dhchap_secret: DH-HMAC-CHAP secret
* @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional
* authentication
* @disable_sqflow: disable controller sq flow control
* @hdr_digest: generate/verify header digest (TCP)
* @data_digest: generate/verify data digest (TCP)
@ -121,6 +126,8 @@ struct nvmf_ctrl_options {
unsigned int kato;
struct nvmf_host *host;
int max_reconnects;
char *dhchap_secret;
char *dhchap_ctrl_secret;
bool disable_sqflow;
bool hdr_digest;
bool data_digest;

View File

@ -346,7 +346,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
* different queue via blk_steal_bios(), so we need to use the bio_split
* pool from the original queue to allocate the bvecs from.
*/
blk_queue_split(&bio);
bio = bio_split_to_limits(bio);
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
@ -408,6 +408,7 @@ const struct block_device_operations nvme_ns_head_ops = {
.open = nvme_ns_head_open,
.release = nvme_ns_head_release,
.ioctl = nvme_ns_head_ioctl,
.compat_ioctl = blkdev_compat_ptr_ioctl,
.getgeo = nvme_getgeo,
.report_zones = nvme_ns_head_report_zones,
.pr_ops = &nvme_pr_ops,
@ -800,16 +801,16 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
return -ENXIO; /* just break out of the loop */
}
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
if (nvme_ctrl_use_ana(ns->ctrl)) {
struct nvme_ana_group_desc desc = {
.grpid = id->anagrpid,
.grpid = anagrpid,
.state = 0,
};
mutex_lock(&ns->ctrl->ana_lock);
ns->ana_grpid = le32_to_cpu(id->anagrpid);
ns->ana_grpid = le32_to_cpu(anagrpid);
nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
mutex_unlock(&ns->ctrl->ana_lock);
if (desc.state) {

View File

@ -140,7 +140,7 @@ enum nvme_quirks {
NVME_QUIRK_DMA_ADDRESS_BITS_48 = (1 << 16),
/*
* The controller requires the command_id value be be limited, so skip
* The controller requires the command_id value be limited, so skip
* encoding the generation sequence number.
*/
NVME_QUIRK_SKIP_CID_GEN = (1 << 17),
@ -328,6 +328,15 @@ struct nvme_ctrl {
struct work_struct ana_work;
#endif
#ifdef CONFIG_NVME_AUTH
struct work_struct dhchap_auth_work;
struct list_head dhchap_auth_list;
struct mutex dhchap_auth_mutex;
struct nvme_dhchap_key *host_key;
struct nvme_dhchap_key *ctrl_key;
u16 transaction;
#endif
/* Power saving configuration */
u64 ps_max_latency_us;
bool apst_enabled;
@ -781,7 +790,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buf, unsigned bufflen);
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
unsigned timeout, int qid, int at_head,
int qid, int at_head,
blk_mq_req_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
@ -837,7 +846,7 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
@ -879,8 +888,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
{
return 0;
}
static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
struct nvme_id_ns *id)
static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
}
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@ -992,6 +1000,27 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
return ctrl->sgls & ((1 << 0) | (1 << 1));
}
/*
 * nvme_auth_* entry points. With CONFIG_NVME_AUTH set only the prototypes
 * are declared here; otherwise the inline stubs in the #else branch are
 * used instead.
 */
#ifdef CONFIG_NVME_AUTH
void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_auth_stop(struct nvme_ctrl *ctrl);
int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
void nvme_auth_reset(struct nvme_ctrl *ctrl);
void nvme_auth_free(struct nvme_ctrl *ctrl);
#else
/* !CONFIG_NVME_AUTH: init and stop are no-ops. */
static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {};
static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {};
/* !CONFIG_NVME_AUTH: negotiation always fails with -EPROTONOSUPPORT. */
static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
{
return -EPROTONOSUPPORT;
}
/* !CONFIG_NVME_AUTH: waiting always reports NVME_SC_AUTH_REQUIRED. */
static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
{
return NVME_SC_AUTH_REQUIRED;
}
/* !CONFIG_NVME_AUTH: nothing to free. */
static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
/*
 * NOTE(review): nvme_auth_reset() has a prototype above but no stub in
 * this branch; callers must themselves be guarded by CONFIG_NVME_AUTH -
 * confirm.
 */
#endif
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 opcode);
int nvme_execute_passthru_rq(struct request *rq);

View File

@ -670,7 +670,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list) {
iod->first_dma = dma_addr;
iod->npages = -1;
return BLK_STS_RESOURCE;
}
@ -1435,8 +1434,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
dev_warn(nvmeq->dev->ctrl.device,
"I/O %d QID %d timeout, aborting\n",
req->tag, nvmeq->qid);
"I/O %d (%s) QID %d timeout, aborting\n",
req->tag,
nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
nvmeq->qid);
abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
BLK_MQ_REQ_NOWAIT);
@ -1765,37 +1766,35 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
}
}
static int nvme_alloc_admin_tags(struct nvme_dev *dev)
static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev)
{
if (!dev->ctrl.admin_q) {
dev->admin_tagset.ops = &nvme_mq_admin_ops;
dev->admin_tagset.nr_hw_queues = 1;
struct blk_mq_tag_set *set = &dev->admin_tagset;
dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev->ctrl.numa_node;
dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
dev->admin_tagset.driver_data = dev;
set->ops = &nvme_mq_admin_ops;
set->nr_hw_queues = 1;
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
return -ENOMEM;
dev->ctrl.admin_tagset = &dev->admin_tagset;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->timeout = NVME_ADMIN_TIMEOUT;
set->numa_node = dev->ctrl.numa_node;
set->cmd_size = sizeof(struct nvme_iod);
set->flags = BLK_MQ_F_NO_SCHED;
set->driver_data = dev;
dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
if (IS_ERR(dev->ctrl.admin_q)) {
blk_mq_free_tag_set(&dev->admin_tagset);
dev->ctrl.admin_q = NULL;
return -ENOMEM;
}
if (!blk_get_queue(dev->ctrl.admin_q)) {
nvme_dev_remove_admin(dev);
dev->ctrl.admin_q = NULL;
return -ENODEV;
}
} else
nvme_start_admin_queue(&dev->ctrl);
if (blk_mq_alloc_tag_set(set))
return -ENOMEM;
dev->ctrl.admin_tagset = set;
dev->ctrl.admin_q = blk_mq_init_queue(set);
if (IS_ERR(dev->ctrl.admin_q)) {
blk_mq_free_tag_set(set);
dev->ctrl.admin_q = NULL;
return -ENOMEM;
}
if (!blk_get_queue(dev->ctrl.admin_q)) {
nvme_dev_remove_admin(dev);
dev->ctrl.admin_q = NULL;
return -ENODEV;
}
return 0;
}
@ -2534,47 +2533,45 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
return true;
}
static void nvme_dev_add(struct nvme_dev *dev)
static void nvme_pci_alloc_tag_set(struct nvme_dev *dev)
{
struct blk_mq_tag_set * set = &dev->tagset;
int ret;
if (!dev->ctrl.tagset) {
dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1;
dev->tagset.nr_maps = 2; /* default + read */
if (dev->io_queues[HCTX_TYPE_POLL])
dev->tagset.nr_maps++;
dev->tagset.timeout = NVME_IO_TIMEOUT;
dev->tagset.numa_node = dev->ctrl.numa_node;
dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
BLK_MQ_MAX_DEPTH) - 1;
dev->tagset.cmd_size = sizeof(struct nvme_iod);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
set->ops = &nvme_mq_ops;
set->nr_hw_queues = dev->online_queues - 1;
set->nr_maps = 2; /* default + read */
if (dev->io_queues[HCTX_TYPE_POLL])
set->nr_maps++;
set->timeout = NVME_IO_TIMEOUT;
set->numa_node = dev->ctrl.numa_node;
set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
set->cmd_size = sizeof(struct nvme_iod);
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->driver_data = dev;
/*
* Some Apple controllers requires tags to be unique
* across admin and IO queue, so reserve the first 32
* tags of the IO queue.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
dev->tagset.reserved_tags = NVME_AQ_DEPTH;
/*
* Some Apple controllers requires tags to be unique
* across admin and IO queue, so reserve the first 32
* tags of the IO queue.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
set->reserved_tags = NVME_AQ_DEPTH;
ret = blk_mq_alloc_tag_set(&dev->tagset);
if (ret) {
dev_warn(dev->ctrl.device,
"IO queues tagset allocation failed %d\n", ret);
return;
}
dev->ctrl.tagset = &dev->tagset;
} else {
blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
/* Free previously allocated queues that are no longer usable */
nvme_free_queues(dev, dev->online_queues);
ret = blk_mq_alloc_tag_set(set);
if (ret) {
dev_warn(dev->ctrl.device,
"IO queues tagset allocation failed %d\n", ret);
return;
}
dev->ctrl.tagset = set;
}
nvme_dbbuf_set(dev);
/*
 * Resize the already-allocated I/O tag set to dev->online_queues - 1 hardware
 * queues, then release the driver queues starting at index
 * dev->online_queues.
 */
static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
{
blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
/* free previously allocated queues that are no longer usable */
nvme_free_queues(dev, dev->online_queues);
}
static int nvme_pci_enable(struct nvme_dev *dev)
@ -2725,10 +2722,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_pci_disable(dev);
nvme_reap_pending_cqes(dev);
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_wait_completed_request(&dev->tagset);
blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
nvme_cancel_tagset(&dev->ctrl);
nvme_cancel_admin_tagset(&dev->ctrl);
/*
* The driver will not be starting up queues again if shutting down so
@ -2842,9 +2837,13 @@ static void nvme_reset_work(struct work_struct *work)
if (result)
goto out_unlock;
result = nvme_alloc_admin_tags(dev);
if (result)
goto out_unlock;
if (!dev->ctrl.admin_q) {
result = nvme_pci_alloc_admin_tag_set(dev);
if (result)
goto out_unlock;
} else {
nvme_start_admin_queue(&dev->ctrl);
}
/*
* Limit the max command size to prevent iod->sg allocations going
@ -2923,7 +2922,11 @@ static void nvme_reset_work(struct work_struct *work)
} else {
nvme_start_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
nvme_dev_add(dev);
if (!dev->ctrl.tagset)
nvme_pci_alloc_tag_set(dev);
else
nvme_pci_update_nr_queues(dev);
nvme_dbbuf_set(dev);
nvme_unfreeze(&dev->ctrl);
}

View File

@ -29,7 +29,7 @@
#include "fabrics.h"
#define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */
#define NVME_RDMA_CM_TIMEOUT_MS 3000 /* 3 second */
#define NVME_RDMA_MAX_SEGMENTS 256
@ -248,12 +248,9 @@ static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
{
int ret;
ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
if (ret < 0)
ret = wait_for_completion_interruptible(&queue->cm_done);
if (ret)
return ret;
if (ret == 0)
return -ETIMEDOUT;
WARN_ON_ONCE(queue->cm_error > 0);
return queue->cm_error;
}
@ -612,7 +609,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
queue->cm_error = -ETIMEDOUT;
ret = rdma_resolve_addr(queue->cm_id, src_addr,
(struct sockaddr *)&ctrl->addr,
NVME_RDMA_CONNECT_TIMEOUT_MS);
NVME_RDMA_CM_TIMEOUT_MS);
if (ret) {
dev_info(ctrl->ctrl.device,
"rdma_resolve_addr failed (%d).\n", ret);
@ -790,50 +787,54 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
return ret;
}
static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
bool admin)
static int nvme_rdma_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
struct blk_mq_tag_set *set;
struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
int ret;
if (admin) {
set = &ctrl->admin_tag_set;
memset(set, 0, sizeof(*set));
set->ops = &nvme_rdma_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->cmd_size = sizeof(struct nvme_rdma_request) +
NVME_RDMA_DATA_SGL_SIZE;
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = NVME_ADMIN_TIMEOUT;
set->flags = BLK_MQ_F_NO_SCHED;
} else {
set = &ctrl->tag_set;
memset(set, 0, sizeof(*set));
set->ops = &nvme_rdma_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
NVME_RDMA_DATA_SGL_SIZE;
if (nctrl->max_integrity_segments)
set->cmd_size += sizeof(struct nvme_rdma_sgl) +
NVME_RDMA_METADATA_SGL_SIZE;
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
}
memset(set, 0, sizeof(*set));
set->ops = &nvme_rdma_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->cmd_size = sizeof(struct nvme_rdma_request) +
NVME_RDMA_DATA_SGL_SIZE;
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = NVME_ADMIN_TIMEOUT;
set->flags = BLK_MQ_F_NO_SCHED;
ret = blk_mq_alloc_tag_set(set);
if (ret)
return ERR_PTR(ret);
if (!ret)
ctrl->ctrl.admin_tagset = set;
return ret;
}
return set;
/*
 * Fill in the I/O tag set embedded in the RDMA controller and allocate it.
 * On success the set is published through ctrl->ctrl.tagset.
 *
 * Returns 0 on success or the error from blk_mq_alloc_tag_set().
 */
static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *nctrl)
{
	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
	struct blk_mq_tag_set *set = &ctrl->tag_set;
	int ret;

	/* Start from a clean slate; every field below is set explicitly. */
	memset(set, 0, sizeof(*set));

	set->ops = &nvme_rdma_mq_ops;
	set->driver_data = ctrl;
	set->flags = BLK_MQ_F_SHOULD_MERGE;
	set->numa_node = nctrl->numa_node;
	set->timeout = NVME_IO_TIMEOUT;

	set->queue_depth = nctrl->sqsize + 1;
	set->reserved_tags = NVMF_RESERVED_TAGS;

	set->nr_hw_queues = nctrl->queue_count - 1;
	if (nctrl->opts->nr_poll_queues)
		set->nr_maps = HCTX_MAX_TYPES;
	else
		set->nr_maps = 2;

	/* Per-request payload: request + data SGL, plus metadata SGL if used. */
	set->cmd_size = sizeof(struct nvme_rdma_request) +
			NVME_RDMA_DATA_SGL_SIZE;
	if (nctrl->max_integrity_segments)
		set->cmd_size += sizeof(struct nvme_rdma_sgl) +
				 NVME_RDMA_METADATA_SGL_SIZE;

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	ctrl->ctrl.tagset = set;
	return 0;
}
static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
@ -885,11 +886,9 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
goto out_free_queue;
if (new) {
ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
if (IS_ERR(ctrl->ctrl.admin_tagset)) {
error = PTR_ERR(ctrl->ctrl.admin_tagset);
error = nvme_rdma_alloc_admin_tag_set(&ctrl->ctrl);
if (error)
goto out_free_async_qe;
}
ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.fabrics_q)) {
@ -972,11 +971,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
return ret;
if (new) {
ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
if (IS_ERR(ctrl->ctrl.tagset)) {
ret = PTR_ERR(ctrl->ctrl.tagset);
ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl);
if (ret)
goto out_free_io_queues;
}
ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl));
if (ret)
@ -1205,6 +1202,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
struct nvme_rdma_ctrl *ctrl = container_of(work,
struct nvme_rdma_ctrl, err_work);
nvme_auth_stop(&ctrl->ctrl);
nvme_stop_keep_alive(&ctrl->ctrl);
flush_work(&ctrl->ctrl.async_event_work);
nvme_rdma_teardown_io_queues(ctrl, false);
@ -1894,7 +1892,7 @@ static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
if (ctrl->opts->tos >= 0)
rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CM_TIMEOUT_MS);
if (ret) {
dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
queue->cm_error);

View File

@ -209,9 +209,11 @@ static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
{
return queue->cmnd_capsule_len - sizeof(struct nvme_command);
if (nvme_is_fabrics(req->req.cmd))
return NVME_TCP_ADMIN_CCSZ;
return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
}
static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
@ -229,7 +231,7 @@ static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
rq = blk_mq_rq_from_pdu(req);
return rq_data_dir(rq) == WRITE && req->data_len &&
req->data_len <= nvme_tcp_inline_data_size(req->queue);
req->data_len <= nvme_tcp_inline_data_size(req);
}
static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
@ -1685,45 +1687,49 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
return ret;
}
static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
bool admin)
static int nvme_tcp_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct blk_mq_tag_set *set;
struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
int ret;
if (admin) {
set = &ctrl->admin_tag_set;
memset(set, 0, sizeof(*set));
set->ops = &nvme_tcp_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = NVME_ADMIN_TIMEOUT;
} else {
set = &ctrl->tag_set;
memset(set, 0, sizeof(*set));
set->ops = &nvme_tcp_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
}
memset(set, 0, sizeof(*set));
set->ops = &nvme_tcp_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = NVME_ADMIN_TIMEOUT;
ret = blk_mq_alloc_tag_set(set);
if (ret)
return ERR_PTR(ret);
if (!ret)
nctrl->admin_tagset = set;
return ret;
}
return set;
/*
 * Fill in the I/O tag set embedded in the TCP controller and allocate it.
 * On success the set is published through nctrl->tagset.
 *
 * Returns 0 on success or the error from blk_mq_alloc_tag_set().
 */
static int nvme_tcp_alloc_tag_set(struct nvme_ctrl *nctrl)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
	struct blk_mq_tag_set *set = &ctrl->tag_set;
	int ret;

	/* Start from a clean slate; every field below is set explicitly. */
	memset(set, 0, sizeof(*set));

	set->ops = &nvme_tcp_mq_ops;
	set->driver_data = ctrl;
	set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
	set->numa_node = nctrl->numa_node;
	set->timeout = NVME_IO_TIMEOUT;
	set->cmd_size = sizeof(struct nvme_tcp_request);

	set->queue_depth = nctrl->sqsize + 1;
	set->reserved_tags = NVMF_RESERVED_TAGS;

	set->nr_hw_queues = nctrl->queue_count - 1;
	if (nctrl->opts->nr_poll_queues)
		set->nr_maps = HCTX_MAX_TYPES;
	else
		set->nr_maps = 2;

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	nctrl->tagset = set;
	return 0;
}
static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
@ -1899,11 +1905,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
return ret;
if (new) {
ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
if (IS_ERR(ctrl->tagset)) {
ret = PTR_ERR(ctrl->tagset);
ret = nvme_tcp_alloc_tag_set(ctrl);
if (ret)
goto out_free_io_queues;
}
ret = nvme_ctrl_init_connect_q(ctrl);
if (ret)
@ -1968,11 +1972,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
return error;
if (new) {
ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
if (IS_ERR(ctrl->admin_tagset)) {
error = PTR_ERR(ctrl->admin_tagset);
error = nvme_tcp_alloc_admin_tag_set(ctrl);
if (error)
goto out_free_queue;
}
ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
if (IS_ERR(ctrl->fabrics_q)) {
@ -2173,6 +2175,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
struct nvme_tcp_ctrl, err_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
nvme_auth_stop(ctrl);
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
nvme_tcp_teardown_io_queues(ctrl, false);
@ -2371,7 +2374,7 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
if (!blk_rq_nr_phys_segments(rq))
nvme_tcp_set_sg_null(c);
else if (rq_data_dir(rq) == WRITE &&
req->data_len <= nvme_tcp_inline_data_size(queue))
req->data_len <= nvme_tcp_inline_data_size(req))
nvme_tcp_set_sg_inline(queue, c, req->data_len);
else
nvme_tcp_set_sg_host_data(c, req->data_len);
@ -2406,7 +2409,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
nvme_tcp_init_iter(req, rq_data_dir(rq));
if (rq_data_dir(rq) == WRITE &&
req->data_len <= nvme_tcp_inline_data_size(queue))
req->data_len <= nvme_tcp_inline_data_size(req))
req->pdu_len = req->data_len;
pdu->hdr.type = nvme_tcp_cmd;

View File

@ -287,6 +287,34 @@ static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc)
return ret;
}
/*
 * Format the command-specific bytes of a fabrics Authentication Send command
 * into the trace sequence: bytes 1..3 of @spc are printed as spsp0, spsp1 and
 * secp, and the little-endian 32-bit value at offset 4 as tl.
 *
 * Returns a pointer to the start of the NUL-terminated text inside @p.
 */
static const char *nvme_trace_fabrics_auth_send(struct trace_seq *p, u8 *spc)
{
	const char *start = trace_seq_buffer_ptr(p);

	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, tl=%u",
			 spc[1], spc[2], spc[3],
			 get_unaligned_le32(spc + 4));
	trace_seq_putc(p, 0);

	return start;
}
/*
 * Format the command-specific bytes of a fabrics Authentication Receive
 * command into the trace sequence: bytes 1..3 of @spc are printed as spsp0,
 * spsp1 and secp, and the little-endian 32-bit value at offset 4 as al.
 *
 * Returns a pointer to the start of the NUL-terminated text inside @p.
 */
static const char *nvme_trace_fabrics_auth_receive(struct trace_seq *p, u8 *spc)
{
	const char *start = trace_seq_buffer_ptr(p);

	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, al=%u",
			 spc[1], spc[2], spc[3],
			 get_unaligned_le32(spc + 4));
	trace_seq_putc(p, 0);

	return start;
}
static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
{
const char *ret = trace_seq_buffer_ptr(p);
@ -306,6 +334,10 @@ const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p,
return nvme_trace_fabrics_connect(p, spc);
case nvme_fabrics_type_property_get:
return nvme_trace_fabrics_property_get(p, spc);
case nvme_fabrics_type_auth_send:
return nvme_trace_fabrics_auth_send(p, spc);
case nvme_fabrics_type_auth_receive:
return nvme_trace_fabrics_auth_receive(p, spc);
default:
return nvme_trace_fabrics_common(p, spc);
}

View File

@ -98,7 +98,7 @@ TRACE_EVENT(nvme_complete_rq,
TP_fast_assign(
__entry->ctrl_id = nvme_req(req)->ctrl->instance;
__entry->qid = nvme_req_qid(req);
__entry->cid = req->tag;
__entry->cid = nvme_req(req)->cmd->common.command_id;
__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
__entry->retries = nvme_req(req)->retries;
__entry->flags = nvme_req(req)->flags;

View File

@ -83,3 +83,18 @@ config NVME_TARGET_TCP
devices over TCP.
If unsure, say N.
config NVME_TARGET_AUTH
bool "NVMe over Fabrics In-band Authentication support"
depends on NVME_TARGET
select NVME_COMMON
select CRYPTO
select CRYPTO_HMAC
select CRYPTO_SHA256
select CRYPTO_SHA512
select CRYPTO_DH
select CRYPTO_DH_RFC7919_GROUPS
help
This enables support for NVMe over Fabrics In-band Authentication.
If unsure, say N.

View File

@ -13,6 +13,7 @@ nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
discovery.o io-cmd-file.o io-cmd-bdev.o
nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o
nvmet-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvmet-$(CONFIG_NVME_TARGET_AUTH) += fabrics-cmd-auth.o auth.o
nvme-loop-y += loop.o
nvmet-rdma-y += rdma.o
nvmet-fc-y += fc.o

View File

@ -1017,7 +1017,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
u16 ret;
if (nvme_is_fabrics(cmd))
return nvmet_parse_fabrics_cmd(req);
return nvmet_parse_fabrics_admin_cmd(req);
if (unlikely(!nvmet_check_auth_status(req)))
return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
return nvmet_parse_discovery_cmd(req);

525
drivers/nvme/target/auth.c Normal file
View File

@ -0,0 +1,525 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics DH-HMAC-CHAP authentication.
* Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions.
* All rights reserved.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <crypto/hash.h>
#include <linux/crc32.h>
#include <linux/base64.h>
#include <linux/ctype.h>
#include <linux/random.h>
#include <linux/nvme-auth.h>
#include <asm/unaligned.h>
#include "nvmet.h"
/*
 * nvmet_auth_set_key - validate and store a DH-HMAC-CHAP secret for a host
 * @host:	host entry that receives the secret
 * @secret:	textual secret, expected to start with "DHHC-1:<hash id>:"
 * @set_ctrl:	true to store it as the controller secret, false to store it
 *		as the host secret
 *
 * The hash id embedded in the secret must be in the range 0..3; a non-zero
 * id must additionally name an HMAC that the crypto layer can provide.
 * A private, whitespace-trimmed copy of @secret is stored in @host.
 *
 * Return: 0 on success, -EINVAL for a malformed secret or hash id,
 * -ENOTSUPP if the selected hash is unavailable, -ENOMEM if the copy cannot
 * be allocated.
 */
int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
		       bool set_ctrl)
{
	unsigned char key_hash;
	char *dhchap_secret;

	if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1)
		return -EINVAL;
	if (key_hash > 3) {
		pr_warn("Invalid DH-HMAC-CHAP hash id %d\n",
			key_hash);
		return -EINVAL;
	}
	if (key_hash > 0) {
		/* Validate selected hash algorithm */
		const char *hmac = nvme_auth_hmac_name(key_hash);

		if (!crypto_has_shash(hmac, 0, 0)) {
			pr_err("DH-HMAC-CHAP hash %s unsupported\n", hmac);
			return -ENOTSUPP;
		}
	}

	dhchap_secret = kstrdup(secret, GFP_KERNEL);
	if (!dhchap_secret)
		return -ENOMEM;

	/*
	 * Replace the stored secret under the write side of nvmet_config_sem:
	 * nvmet_setup_auth() reads the secrets with the semaphore held for
	 * reading, so the old copy can be released here instead of leaked.
	 */
	down_write(&nvmet_config_sem);
	if (set_ctrl) {
		kfree(host->dhchap_ctrl_secret);
		host->dhchap_ctrl_secret = strim(dhchap_secret);
		host->dhchap_ctrl_key_hash = key_hash;
	} else {
		kfree(host->dhchap_secret);
		host->dhchap_secret = strim(dhchap_secret);
		host->dhchap_key_hash = key_hash;
	}
	up_write(&nvmet_config_sem);

	return 0;
}
/*
 * nvmet_setup_dhgroup - select the DH group of a controller and create keys
 * @ctrl:	controller to configure
 * @dhgroup_id:	requested DH group identifier
 *
 * An existing transform for the same group is reused unchanged; a transform
 * for a different group is released first.  NVME_AUTH_DHGROUP_NULL leaves
 * the controller without a DH transform.  Otherwise a new transform is
 * allocated, a private key is generated and the matching public key is
 * stored in ctrl->dh_key (ctrl->dh_keysize bytes).
 *
 * Return: 0 on success, -EINVAL if no kpp algorithm is known for
 * @dhgroup_id, -ENOMEM if the public key buffer cannot be allocated, or the
 * error reported by the crypto / nvme-auth helpers.
 */
int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id)
{
	const char *dhgroup_kpp;
	int ret = 0;

	pr_debug("%s: ctrl %d selecting dhgroup %d\n",
		 __func__, ctrl->cntlid, dhgroup_id);

	if (ctrl->dh_tfm) {
		if (ctrl->dh_gid == dhgroup_id) {
			pr_debug("%s: ctrl %d reuse existing DH group %d\n",
				 __func__, ctrl->cntlid, dhgroup_id);
			return 0;
		}
		crypto_free_kpp(ctrl->dh_tfm);
		ctrl->dh_tfm = NULL;
		ctrl->dh_gid = 0;
	}

	if (dhgroup_id == NVME_AUTH_DHGROUP_NULL)
		return 0;

	dhgroup_kpp = nvme_auth_dhgroup_kpp(dhgroup_id);
	if (!dhgroup_kpp) {
		pr_debug("%s: ctrl %d invalid DH group %d\n",
			 __func__, ctrl->cntlid, dhgroup_id);
		return -EINVAL;
	}

	ctrl->dh_tfm = crypto_alloc_kpp(dhgroup_kpp, 0, 0);
	if (IS_ERR(ctrl->dh_tfm)) {
		pr_debug("%s: ctrl %d failed to setup DH group %d, err %ld\n",
			 __func__, ctrl->cntlid, dhgroup_id,
			 PTR_ERR(ctrl->dh_tfm));
		ret = PTR_ERR(ctrl->dh_tfm);
		ctrl->dh_tfm = NULL;
		ctrl->dh_gid = 0;
		return ret;
	}

	ctrl->dh_gid = dhgroup_id;
	pr_debug("%s: ctrl %d setup DH group %d\n",
		 __func__, ctrl->cntlid, ctrl->dh_gid);

	ret = nvme_auth_gen_privkey(ctrl->dh_tfm, ctrl->dh_gid);
	if (ret < 0) {
		pr_debug("%s: ctrl %d failed to generate private key, err %d\n",
			 __func__, ctrl->cntlid, ret);
		kfree_sensitive(ctrl->dh_key);
		/*
		 * Clear the pointer: it is freed again on the next setup and
		 * in nvmet_destroy_auth(), which would otherwise double free.
		 */
		ctrl->dh_key = NULL;
		return ret;
	}

	ctrl->dh_keysize = crypto_kpp_maxsize(ctrl->dh_tfm);
	kfree_sensitive(ctrl->dh_key);
	ctrl->dh_key = kzalloc(ctrl->dh_keysize, GFP_KERNEL);
	if (!ctrl->dh_key) {
		pr_warn("ctrl %d failed to allocate public key\n",
			ctrl->cntlid);
		return -ENOMEM;
	}

	ret = nvme_auth_gen_pubkey(ctrl->dh_tfm, ctrl->dh_key,
				   ctrl->dh_keysize);
	if (ret < 0) {
		pr_warn("ctrl %d failed to generate public key\n",
			ctrl->cntlid);
		kfree(ctrl->dh_key);
		ctrl->dh_key = NULL;
	}

	return ret;
}
int nvmet_setup_auth(struct nvmet_ctrl *ctrl)
{
int ret = 0;
struct nvmet_host_link *p;
struct nvmet_host *host = NULL;
const char *hash_name;
down_read(&nvmet_config_sem);
if (nvmet_is_disc_subsys(ctrl->subsys))
goto out_unlock;
if (ctrl->subsys->allow_any_host)
goto out_unlock;
list_for_each_entry(p, &ctrl->subsys->hosts, entry) {
pr_debug("check %s\n", nvmet_host_name(p->host));
if (strcmp(nvmet_host_name(p->host), ctrl->hostnqn))
continue;
host = p->host;
break;
}
if (!host) {
pr_debug("host %s not found\n", ctrl->hostnqn);
ret = -EPERM;
goto out_unlock;
}
ret = nvmet_setup_dhgroup(ctrl, host->dhchap_dhgroup_id);
if (ret < 0)
pr_warn("Failed to setup DH group");
if (!host->dhchap_secret) {
pr_debug("No authentication provided\n");
goto out_unlock;
}
if (host->dhchap_hash_id == ctrl->shash_id) {
pr_debug("Re-use existing hash ID %d\n",
ctrl->shash_id);
} else {
hash_name = nvme_auth_hmac_name(host->dhchap_hash_id);
if (!hash_name) {
pr_warn("Hash ID %d invalid\n", host->dhchap_hash_id);
ret = -EINVAL;
goto out_unlock;
}
ctrl->shash_id = host->dhchap_hash_id;
}
/* Skip the 'DHHC-1:XX:' prefix */
nvme_auth_free_key(ctrl->host_key);
ctrl->host_key = nvme_auth_extract_key(host->dhchap_secret + 10,
host->dhchap_key_hash);
if (IS_ERR(ctrl->host_key)) {
ret = PTR_ERR(ctrl->host_key);
ctrl->host_key = NULL;
goto out_free_hash;
}
pr_debug("%s: using hash %s key %*ph\n", __func__,
ctrl->host_key->hash > 0 ?
nvme_auth_hmac_name(ctrl->host_key->hash) : "none",
(int)ctrl->host_key->len, ctrl->host_key->key);
nvme_auth_free_key(ctrl->ctrl_key);
if (!host->dhchap_ctrl_secret) {
ctrl->ctrl_key = NULL;
goto out_unlock;
}
ctrl->ctrl_key = nvme_auth_extract_key(host->dhchap_ctrl_secret + 10,
host->dhchap_ctrl_key_hash);
if (IS_ERR(ctrl->ctrl_key)) {
ret = PTR_ERR(ctrl->ctrl_key);
ctrl->ctrl_key = NULL;
}
pr_debug("%s: using ctrl hash %s key %*ph\n", __func__,
ctrl->ctrl_key->hash > 0 ?
nvme_auth_hmac_name(ctrl->ctrl_key->hash) : "none",
(int)ctrl->ctrl_key->len, ctrl->ctrl_key->key);
out_free_hash:
if (ret) {
if (ctrl->host_key) {
nvme_auth_free_key(ctrl->host_key);
ctrl->host_key = NULL;
}
ctrl->shash_id = 0;
}
out_unlock:
up_read(&nvmet_config_sem);
return ret;
}
/*
 * nvmet_auth_sq_free - release the per-queue authentication state
 * @sq:	submission queue whose authentication buffers are dropped
 *
 * Cancels the pending authentication-expiry work and frees both challenge
 * buffers and the DH session key, clearing each pointer afterwards.
 */
void nvmet_auth_sq_free(struct nvmet_sq *sq)
{
	cancel_delayed_work(&sq->auth_expired_work);
	kfree(sq->dhchap_c1);
	sq->dhchap_c1 = NULL;
	kfree(sq->dhchap_c2);
	sq->dhchap_c2 = NULL;
	/* The session key is secret material: wipe it before freeing. */
	kfree_sensitive(sq->dhchap_skey);
	sq->dhchap_skey = NULL;
}
/*
 * nvmet_destroy_auth - drop all authentication state held by a controller
 * @ctrl:	controller being torn down
 *
 * Frees the host and controller keys, the DH public key buffer and the DH
 * transform, and resets the associated identifiers to zero / NULL.
 */
void nvmet_destroy_auth(struct nvmet_ctrl *ctrl)
{
	/* Authentication keys. */
	if (ctrl->host_key) {
		nvme_auth_free_key(ctrl->host_key);
		ctrl->host_key = NULL;
	}
	if (ctrl->ctrl_key) {
		nvme_auth_free_key(ctrl->ctrl_key);
		ctrl->ctrl_key = NULL;
	}

	/* Diffie-Hellman state. */
	kfree_sensitive(ctrl->dh_key);
	ctrl->dh_key = NULL;
	if (ctrl->dh_tfm) {
		crypto_free_kpp(ctrl->dh_tfm);
		ctrl->dh_tfm = NULL;
		ctrl->dh_gid = 0;
	}

	ctrl->shash_id = 0;
}
bool nvmet_check_auth_status(struct nvmet_req *req)
{
if (req->sq->ctrl->host_key &&
!req->sq->authenticated)
return false;
return true;
}
/*
 * nvmet_auth_host_hash - compute the expected host response
 * @req:	request carrying the queue whose transaction is being verified
 * @response:	output buffer receiving the digest
 * @shash_len:	length of @response; must equal the digest size of the hash
 *		selected by ctrl->shash_id
 *
 * The HMAC is keyed with the host key transformed by the host NQN.  The
 * hashed message is, in order: the challenge (the queue's C1, or the
 * augmented challenge when a DH group is in use), the sequence number S1 as
 * 32-bit little endian, the transaction id as 16-bit little endian, a zero
 * byte, the string "HostHost", the host NQN, a zero byte and the subsystem
 * NQN.
 *
 * Return: 0 on success, -EINVAL for an unknown hash id or a digest length
 * mismatch, -ENOMEM on allocation failure, or the error reported by the
 * crypto / nvme-auth helpers.
 */
int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
			 unsigned int shash_len)
{
	struct crypto_shash *shash_tfm;
	struct shash_desc *shash;
	struct nvmet_ctrl *ctrl = req->sq->ctrl;
	const char *hash_name;
	u8 *challenge = req->sq->dhchap_c1, *host_response;
	u8 buf[4];
	int ret;

	hash_name = nvme_auth_hmac_name(ctrl->shash_id);
	if (!hash_name) {
		pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
		return -EINVAL;
	}

	shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
	if (IS_ERR(shash_tfm)) {
		pr_err("failed to allocate shash %s\n", hash_name);
		return PTR_ERR(shash_tfm);
	}

	if (shash_len != crypto_shash_digestsize(shash_tfm)) {
		pr_debug("%s: hash len mismatch (len %d digest %d)\n",
			 __func__, shash_len,
			 crypto_shash_digestsize(shash_tfm));
		ret = -EINVAL;
		goto out_free_tfm;
	}

	host_response = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn);
	if (IS_ERR(host_response)) {
		ret = PTR_ERR(host_response);
		goto out_free_tfm;
	}

	ret = crypto_shash_setkey(shash_tfm, host_response,
				  ctrl->host_key->len);
	if (ret)
		goto out_free_response;

	if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
		challenge = kmalloc(shash_len, GFP_KERNEL);
		if (!challenge) {
			ret = -ENOMEM;
			goto out_free_response;
		}
		ret = nvme_auth_augmented_challenge(ctrl->shash_id,
						    req->sq->dhchap_skey,
						    req->sq->dhchap_skey_len,
						    req->sq->dhchap_c1,
						    challenge, shash_len);
		if (ret)
			goto out_free_challenge;
	}

	pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
		 ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
		 req->sq->dhchap_tid);

	shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
			GFP_KERNEL);
	if (!shash) {
		ret = -ENOMEM;
		goto out_free_challenge;
	}
	shash->tfm = shash_tfm;
	ret = crypto_shash_init(shash);
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, challenge, shash_len);
	if (ret)
		goto out;
	put_unaligned_le32(req->sq->dhchap_s1, buf);
	ret = crypto_shash_update(shash, buf, 4);
	if (ret)
		goto out;
	put_unaligned_le16(req->sq->dhchap_tid, buf);
	ret = crypto_shash_update(shash, buf, 2);
	if (ret)
		goto out;
	/* From here on buf only supplies single zero bytes. */
	memset(buf, 0, 4);
	ret = crypto_shash_update(shash, buf, 1);
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, "HostHost", 8);
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, buf, 1);
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, ctrl->subsysnqn,
				  strlen(ctrl->subsysnqn));
	if (ret)
		goto out;
	ret = crypto_shash_final(shash, response);
out:
	kfree(shash);
out_free_challenge:
	/* Only the augmented challenge is ours; C1 belongs to the queue. */
	if (challenge != req->sq->dhchap_c1)
		kfree(challenge);
out_free_response:
	kfree_sensitive(host_response);
out_free_tfm:
	crypto_free_shash(shash_tfm);
	/* Propagate failures instead of reporting success unconditionally. */
	return ret;
}
/*
 * nvmet_auth_ctrl_hash - compute the controller response
 * @req:	request carrying the queue whose transaction is being answered
 * @response:	output buffer receiving the digest
 * @shash_len:	length of @response; must equal the digest size of the hash
 *		selected by ctrl->shash_id
 *
 * The HMAC is keyed with the controller key transformed by the subsystem
 * NQN.  The hashed message is, in order: the challenge (the queue's C2, or
 * the augmented challenge when a DH group is in use), the sequence number S2
 * as 32-bit little endian, the transaction id as 16-bit little endian, a
 * zero byte, the string "Controller", the subsystem NQN, a zero byte and the
 * host NQN.
 *
 * Return: 0 on success, -EINVAL for an unknown hash id or a digest length
 * mismatch, -ENOMEM on allocation failure, or the error reported by the
 * crypto / nvme-auth helpers.
 */
int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
			 unsigned int shash_len)
{
	struct crypto_shash *shash_tfm;
	struct shash_desc *shash;
	struct nvmet_ctrl *ctrl = req->sq->ctrl;
	const char *hash_name;
	u8 *challenge = req->sq->dhchap_c2, *ctrl_response;
	u8 buf[4];
	int ret;

	hash_name = nvme_auth_hmac_name(ctrl->shash_id);
	if (!hash_name) {
		pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
		return -EINVAL;
	}

	shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
	if (IS_ERR(shash_tfm)) {
		pr_err("failed to allocate shash %s\n", hash_name);
		return PTR_ERR(shash_tfm);
	}

	if (shash_len != crypto_shash_digestsize(shash_tfm)) {
		pr_debug("%s: hash len mismatch (len %d digest %d)\n",
			 __func__, shash_len,
			 crypto_shash_digestsize(shash_tfm));
		ret = -EINVAL;
		goto out_free_tfm;
	}

	ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
						ctrl->subsysnqn);
	if (IS_ERR(ctrl_response)) {
		ret = PTR_ERR(ctrl_response);
		goto out_free_tfm;
	}

	ret = crypto_shash_setkey(shash_tfm, ctrl_response,
				  ctrl->ctrl_key->len);
	if (ret)
		goto out_free_response;

	if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
		challenge = kmalloc(shash_len, GFP_KERNEL);
		if (!challenge) {
			ret = -ENOMEM;
			goto out_free_response;
		}
		ret = nvme_auth_augmented_challenge(ctrl->shash_id,
						    req->sq->dhchap_skey,
						    req->sq->dhchap_skey_len,
						    req->sq->dhchap_c2,
						    challenge, shash_len);
		if (ret)
			goto out_free_challenge;
	}

	shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
			GFP_KERNEL);
	if (!shash) {
		ret = -ENOMEM;
		goto out_free_challenge;
	}
	shash->tfm = shash_tfm;

	ret = crypto_shash_init(shash);
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, challenge, shash_len);
	if (ret)
		goto out;
	put_unaligned_le32(req->sq->dhchap_s2, buf);
	ret = crypto_shash_update(shash, buf, 4);
	if (ret)
		goto out;
	put_unaligned_le16(req->sq->dhchap_tid, buf);
	ret = crypto_shash_update(shash, buf, 2);
	if (ret)
		goto out;
	/* From here on buf only supplies single zero bytes. */
	memset(buf, 0, 4);
	ret = crypto_shash_update(shash, buf, 1);
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, "Controller", 10);
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, ctrl->subsysnqn,
				  strlen(ctrl->subsysnqn));
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, buf, 1);
	if (ret)
		goto out;
	ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
	if (ret)
		goto out;
	ret = crypto_shash_final(shash, response);
out:
	kfree(shash);
out_free_challenge:
	/* Only the augmented challenge is ours; C2 belongs to the queue. */
	if (challenge != req->sq->dhchap_c2)
		kfree(challenge);
out_free_response:
	kfree_sensitive(ctrl_response);
out_free_tfm:
	crypto_free_shash(shash_tfm);
	/* Propagate failures instead of reporting success unconditionally. */
	return ret;
}
/*
 * nvmet_auth_ctrl_exponential - copy the controller's DH public key
 * @req:	request identifying the controller
 * @buf:	destination buffer
 * @buf_size:	size of @buf; must equal ctrl->dh_keysize
 *
 * Return: 0 on success, -ENOKEY if the controller has no public key,
 * -EINVAL if @buf_size does not match the key size.
 */
int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
				u8 *buf, int buf_size)
{
	struct nvmet_ctrl *ctrl = req->sq->ctrl;

	if (!ctrl->dh_key) {
		pr_warn("ctrl %d no DH public key!\n", ctrl->cntlid);
		return -ENOKEY;
	}

	if (buf_size != ctrl->dh_keysize) {
		pr_warn("ctrl %d DH public key size mismatch, need %zu is %d\n",
			ctrl->cntlid, ctrl->dh_keysize, buf_size);
		return -EINVAL;
	}

	memcpy(buf, ctrl->dh_key, buf_size);
	pr_debug("%s: ctrl %d public key %*ph\n", __func__,
		 ctrl->cntlid, (int)buf_size, buf);

	return 0;
}
/*
 * nvmet_auth_ctrl_sesskey - derive the DH session key for a queue
 * @req:	request identifying the queue and controller
 * @pkey:	peer public key
 * @pkey_size:	length of @pkey
 *
 * Allocates a zeroed buffer of ctrl->dh_keysize bytes on the queue and has
 * nvme_auth_gen_shared_secret() fill it from the controller's DH transform
 * and @pkey.  The buffer stays attached to the queue even when the
 * computation fails.
 *
 * Return: 0 on success, -ENOMEM if the buffer cannot be allocated, or the
 * error from nvme_auth_gen_shared_secret().
 */
int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
			    u8 *pkey, int pkey_size)
{
	struct nvmet_sq *sq = req->sq;
	struct nvmet_ctrl *ctrl = sq->ctrl;
	int ret;

	sq->dhchap_skey_len = ctrl->dh_keysize;
	sq->dhchap_skey = kzalloc(sq->dhchap_skey_len, GFP_KERNEL);
	if (!sq->dhchap_skey)
		return -ENOMEM;

	ret = nvme_auth_gen_shared_secret(ctrl->dh_tfm, pkey, pkey_size,
					  sq->dhchap_skey,
					  sq->dhchap_skey_len);
	if (ret) {
		pr_debug("failed to compute shared secret, err %d\n", ret);
		return ret;
	}

	pr_debug("%s: shared secret %*ph\n", __func__,
		 (int)sq->dhchap_skey_len, sq->dhchap_skey);
	return 0;
}

View File

@ -11,6 +11,11 @@
#include <linux/ctype.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#ifdef CONFIG_NVME_TARGET_AUTH
#include <linux/nvme-auth.h>
#endif
#include <crypto/hash.h>
#include <crypto/kpp.h>
#include "nvmet.h"
@ -1680,10 +1685,133 @@ static const struct config_item_type nvmet_ports_type = {
static struct config_group nvmet_subsystems_group;
static struct config_group nvmet_ports_group;
#ifdef CONFIG_NVME_TARGET_AUTH
/*
 * configfs "dhchap_key" read: print the stored host secret followed by a
 * newline, or just a newline when no secret is set.
 */
static ssize_t nvmet_host_dhchap_key_show(struct config_item *item,
		char *page)
{
	u8 *secret = to_host(item)->dhchap_secret;

	if (secret)
		return sprintf(page, "%s\n", secret);

	return sprintf(page, "\n");
}
/*
 * configfs "dhchap_key" write: hand the written text to nvmet_auth_set_key()
 * as the host secret.  Returns @count on success or the negative error from
 * nvmet_auth_set_key().
 */
static ssize_t nvmet_host_dhchap_key_store(struct config_item *item,
		const char *page, size_t count)
{
	int err = nvmet_auth_set_key(to_host(item), page, false);

	/*
	 * Re-authentication is a soft state, so keep the
	 * current authentication valid until the host
	 * requests re-authentication.
	 */
	if (err < 0)
		return err;

	return count;
}

CONFIGFS_ATTR(nvmet_host_, dhchap_key);
/*
 * configfs "dhchap_ctrl_key" read: print the stored controller secret
 * followed by a newline, or just a newline when no secret is set.
 */
static ssize_t nvmet_host_dhchap_ctrl_key_show(struct config_item *item,
		char *page)
{
	u8 *secret = to_host(item)->dhchap_ctrl_secret;

	if (secret)
		return sprintf(page, "%s\n", secret);

	return sprintf(page, "\n");
}
/*
 * configfs "dhchap_ctrl_key" write: hand the written text to
 * nvmet_auth_set_key() as the controller secret.  Returns @count on success
 * or the negative error from nvmet_auth_set_key().
 */
static ssize_t nvmet_host_dhchap_ctrl_key_store(struct config_item *item,
		const char *page, size_t count)
{
	int err = nvmet_auth_set_key(to_host(item), page, true);

	/*
	 * Re-authentication is a soft state, so keep the
	 * current authentication valid until the host
	 * requests re-authentication.
	 */
	if (err < 0)
		return err;

	return count;
}

CONFIGFS_ATTR(nvmet_host_, dhchap_ctrl_key);
static ssize_t nvmet_host_dhchap_hash_show(struct config_item *item,
char *page)
{
struct nvmet_host *host = to_host(item);
const char *hash_name = nvme_auth_hmac_name(host->dhchap_hash_id);
return sprintf(page, "%s\n", hash_name ? hash_name : "none");
}
/*
 * configfs "dhchap_hash" write: translate the written name into a hash id
 * and store it on the host.  Returns @count on success, -EINVAL for an
 * unknown name, -ENOTSUPP when the crypto layer lacks the algorithm.
 */
static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item,
		const char *page, size_t count)
{
	u8 id = nvme_auth_hmac_id(page);

	if (id == NVME_AUTH_HASH_INVALID)
		return -EINVAL;

	if (!crypto_has_shash(nvme_auth_hmac_name(id), 0, 0))
		return -ENOTSUPP;

	to_host(item)->dhchap_hash_id = id;
	return count;
}

CONFIGFS_ATTR(nvmet_host_, dhchap_hash);
static ssize_t nvmet_host_dhchap_dhgroup_show(struct config_item *item,
char *page)
{
struct nvmet_host *host = to_host(item);
const char *dhgroup = nvme_auth_dhgroup_name(host->dhchap_dhgroup_id);
return sprintf(page, "%s\n", dhgroup ? dhgroup : "none");
}
/*
 * configfs "dhchap_dhgroup" write: translate the written name into a DH
 * group id and store it on the host.  Any group other than the NULL group
 * must have its kpp algorithm available.  Returns @count on success and
 * -EINVAL for an unknown or unavailable group.
 */
static ssize_t nvmet_host_dhchap_dhgroup_store(struct config_item *item,
		const char *page, size_t count)
{
	int gid = nvme_auth_dhgroup_id(page);

	if (gid == NVME_AUTH_DHGROUP_INVALID)
		return -EINVAL;

	/* The kpp lookup is skipped entirely for the NULL group. */
	if (gid != NVME_AUTH_DHGROUP_NULL &&
	    !crypto_has_kpp(nvme_auth_dhgroup_kpp(gid), 0, 0))
		return -EINVAL;

	to_host(item)->dhchap_dhgroup_id = gid;
	return count;
}

CONFIGFS_ATTR(nvmet_host_, dhchap_dhgroup);
/*
 * NULL-terminated list of the per-host DH-HMAC-CHAP configfs attributes
 * (host key, controller key, hash and DH group). Referenced from
 * nvmet_host_type.ct_attrs when CONFIG_NVME_TARGET_AUTH is enabled.
 */
static struct configfs_attribute *nvmet_host_attrs[] = {
&nvmet_host_attr_dhchap_key,
&nvmet_host_attr_dhchap_ctrl_key,
&nvmet_host_attr_dhchap_hash,
&nvmet_host_attr_dhchap_dhgroup,
NULL,
};
#endif /* CONFIG_NVME_TARGET_AUTH */
/*
 * configfs release callback for a host item: frees the host object and,
 * when authentication support is built in, both DH-HMAC-CHAP secrets
 * hanging off it.
 */
static void nvmet_host_release(struct config_item *item)
{
	struct nvmet_host *host = to_host(item);

#ifdef CONFIG_NVME_TARGET_AUTH
	kfree(host->dhchap_secret);
	/* The controller secret is a separate pointer; free it as well. */
	kfree(host->dhchap_ctrl_secret);
#endif
	kfree(host);
}
@ -1693,6 +1821,9 @@ static struct configfs_item_operations nvmet_host_item_ops = {
/*
 * configfs item type for a host entry. The DH-HMAC-CHAP attributes are
 * only exposed when CONFIG_NVME_TARGET_AUTH is enabled.
 */
static const struct config_item_type nvmet_host_type = {
.ct_item_ops = &nvmet_host_item_ops,
#ifdef CONFIG_NVME_TARGET_AUTH
.ct_attrs = nvmet_host_attrs,
#endif
.ct_owner = THIS_MODULE,
};
@ -1705,6 +1836,11 @@ static struct config_group *nvmet_hosts_make_group(struct config_group *group,
if (!host)
return ERR_PTR(-ENOMEM);
#ifdef CONFIG_NVME_TARGET_AUTH
/* Default to SHA256 */
host->dhchap_hash_id = NVME_AUTH_HASH_SHA256;
#endif
config_group_init_type_name(&host->group, name, &nvmet_host_type);
return &host->group;

View File

@ -795,6 +795,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
wait_for_completion(&sq->confirm_done);
wait_for_completion(&sq->free_done);
percpu_ref_exit(&sq->ref);
nvmet_auth_sq_free(sq);
if (ctrl) {
/*
@ -865,8 +866,15 @@ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
u16 ret;
if (nvme_is_fabrics(cmd))
return nvmet_parse_fabrics_io_cmd(req);
if (unlikely(!nvmet_check_auth_status(req)))
return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
ret = nvmet_check_ctrl_status(req);
if (unlikely(ret))
return ret;
@ -1271,6 +1279,11 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req)
req->cmd->common.opcode, req->sq->qid);
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
if (unlikely(!nvmet_check_auth_status(req))) {
pr_warn("qid %d not authenticated\n", req->sq->qid);
return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR;
}
return 0;
}
@ -1467,6 +1480,8 @@ static void nvmet_ctrl_free(struct kref *ref)
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fatal_err_work);
nvmet_destroy_auth(ctrl);
ida_free(&cntlid_ida, ctrl->cntlid);
nvmet_async_events_free(ctrl);

View File

@ -0,0 +1,544 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics DH-HMAC-CHAP authentication command handling.
* Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions.
* All rights reserved.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/nvme-auth.h>
#include <crypto/hash.h>
#include <crypto/kpp.h>
#include "nvmet.h"
/*
 * Delayed-work handler run when an authentication transaction expires:
 * puts the queue back into the NEGOTIATE step and invalidates the
 * stored transaction id.
 */
static void nvmet_auth_expired_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct nvmet_sq *sq =
		container_of(dwork, struct nvmet_sq, auth_expired_work);

	pr_debug("%s: ctrl %d qid %d transaction %u expired, resetting\n",
		 __func__, sq->ctrl->cntlid, sq->qid, sq->dhchap_tid);

	sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
	sq->dhchap_tid = -1;
}
void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
{
u32 result = le32_to_cpu(req->cqe->result.u32);
/* Initialize in-band authentication */
INIT_DELAYED_WORK(&req->sq->auth_expired_work,
nvmet_auth_expired_work);
req->sq->authenticated = false;
req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
result |= (u32)NVME_CONNECT_AUTHREQ_ATR << 16;
req->cqe->result.u32 = cpu_to_le32(result);
}
static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmf_auth_dhchap_negotiate_data *data = d;
int i, hash_id = 0, fallback_hash_id = 0, dhgid, fallback_dhgid;
pr_debug("%s: ctrl %d qid %d: data sc_d %d napd %d authid %d halen %d dhlen %d\n",
__func__, ctrl->cntlid, req->sq->qid,
data->sc_c, data->napd, data->auth_protocol[0].dhchap.authid,
data->auth_protocol[0].dhchap.halen,
data->auth_protocol[0].dhchap.dhlen);
req->sq->dhchap_tid = le16_to_cpu(data->t_id);
if (data->sc_c)
return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
if (data->napd != 1)
return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
if (data->auth_protocol[0].dhchap.authid !=
NVME_AUTH_DHCHAP_AUTH_ID)
return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
for (i = 0; i < data->auth_protocol[0].dhchap.halen; i++) {
u8 host_hmac_id = data->auth_protocol[0].dhchap.idlist[i];
if (!fallback_hash_id &&
crypto_has_shash(nvme_auth_hmac_name(host_hmac_id), 0, 0))
fallback_hash_id = host_hmac_id;
if (ctrl->shash_id != host_hmac_id)
continue;
hash_id = ctrl->shash_id;
break;
}
if (hash_id == 0) {
if (fallback_hash_id == 0) {
pr_debug("%s: ctrl %d qid %d: no usable hash found\n",
__func__, ctrl->cntlid, req->sq->qid);
return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
}
pr_debug("%s: ctrl %d qid %d: no usable hash found, falling back to %s\n",
__func__, ctrl->cntlid, req->sq->qid,
nvme_auth_hmac_name(fallback_hash_id));
ctrl->shash_id = fallback_hash_id;
}
dhgid = -1;
fallback_dhgid = -1;
for (i = 0; i < data->auth_protocol[0].dhchap.dhlen; i++) {
int tmp_dhgid = data->auth_protocol[0].dhchap.idlist[i + 30];
if (tmp_dhgid != ctrl->dh_gid) {
dhgid = tmp_dhgid;
break;
}
if (fallback_dhgid < 0) {
const char *kpp = nvme_auth_dhgroup_kpp(tmp_dhgid);
if (crypto_has_kpp(kpp, 0, 0))
fallback_dhgid = tmp_dhgid;
}
}
if (dhgid < 0) {
if (fallback_dhgid < 0) {
pr_debug("%s: ctrl %d qid %d: no usable DH group found\n",
__func__, ctrl->cntlid, req->sq->qid);
return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
}
pr_debug("%s: ctrl %d qid %d: configured DH group %s not found\n",
__func__, ctrl->cntlid, req->sq->qid,
nvme_auth_dhgroup_name(fallback_dhgid));
ctrl->dh_gid = fallback_dhgid;
}
pr_debug("%s: ctrl %d qid %d: selected DH group %s (%d)\n",
__func__, ctrl->cntlid, req->sq->qid,
nvme_auth_dhgroup_name(ctrl->dh_gid), ctrl->dh_gid);
return 0;
}
static u16 nvmet_auth_reply(struct nvmet_req *req, void *d)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmf_auth_dhchap_reply_data *data = d;
u16 dhvlen = le16_to_cpu(data->dhvlen);
u8 *response;
pr_debug("%s: ctrl %d qid %d: data hl %d cvalid %d dhvlen %u\n",
__func__, ctrl->cntlid, req->sq->qid,
data->hl, data->cvalid, dhvlen);
if (dhvlen) {
if (!ctrl->dh_tfm)
return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
if (nvmet_auth_ctrl_sesskey(req, data->rval + 2 * data->hl,
dhvlen) < 0)
return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
}
response = kmalloc(data->hl, GFP_KERNEL);
if (!response)
return NVME_AUTH_DHCHAP_FAILURE_FAILED;
if (!ctrl->host_key) {
pr_warn("ctrl %d qid %d no host key\n",
ctrl->cntlid, req->sq->qid);
kfree(response);
return NVME_AUTH_DHCHAP_FAILURE_FAILED;
}
if (nvmet_auth_host_hash(req, response, data->hl) < 0) {
pr_debug("ctrl %d qid %d host hash failed\n",
ctrl->cntlid, req->sq->qid);
kfree(response);
return NVME_AUTH_DHCHAP_FAILURE_FAILED;
}
if (memcmp(data->rval, response, data->hl)) {
pr_info("ctrl %d qid %d host response mismatch\n",
ctrl->cntlid, req->sq->qid);
kfree(response);
return NVME_AUTH_DHCHAP_FAILURE_FAILED;
}
kfree(response);
pr_debug("%s: ctrl %d qid %d host authenticated\n",
__func__, ctrl->cntlid, req->sq->qid);
if (data->cvalid) {
req->sq->dhchap_c2 = kmalloc(data->hl, GFP_KERNEL);
if (!req->sq->dhchap_c2)
return NVME_AUTH_DHCHAP_FAILURE_FAILED;
memcpy(req->sq->dhchap_c2, data->rval + data->hl, data->hl);
pr_debug("%s: ctrl %d qid %d challenge %*ph\n",
__func__, ctrl->cntlid, req->sq->qid, data->hl,
req->sq->dhchap_c2);
req->sq->dhchap_s2 = le32_to_cpu(data->seqnum);
} else {
req->sq->authenticated = true;
req->sq->dhchap_c2 = NULL;
}
return 0;
}
static u16 nvmet_auth_failure2(struct nvmet_req *req, void *d)
{
struct nvmf_auth_dhchap_failure_data *data = d;
return data->rescode_exp;
}
/*
 * Execute handler for the fabrics Authentication Send command.
 *
 * Validates the command fields, copies the payload from the host and
 * advances the per-queue DH-HMAC-CHAP state machine according to the
 * message type. Protocol-level failures are recorded in
 * sq->dhchap_status / sq->dhchap_step and reported by the following
 * Authentication Receive; the NVMe status of this command is only
 * non-zero for malformed commands or internal errors.
 *
 * All accesses to req->sq happen before the request is completed,
 * because completion hands the request back to the transport.
 */
void nvmet_execute_auth_send(struct nvmet_req *req)
{
	struct nvmet_ctrl *ctrl = req->sq->ctrl;
	struct nvmf_auth_dhchap_success2_data *data;
	void *d;
	u32 tl;
	u16 status = 0;

	if (req->cmd->auth_send.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) {
		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		req->error_loc =
			offsetof(struct nvmf_auth_send_command, secp);
		goto done;
	}
	if (req->cmd->auth_send.spsp0 != 0x01) {
		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		req->error_loc =
			offsetof(struct nvmf_auth_send_command, spsp0);
		goto done;
	}
	if (req->cmd->auth_send.spsp1 != 0x01) {
		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		req->error_loc =
			offsetof(struct nvmf_auth_send_command, spsp1);
		goto done;
	}
	tl = le32_to_cpu(req->cmd->auth_send.tl);
	/*
	 * Every message is dispatched on its leading auth_type, auth_id
	 * and t_id fields; reject transfers too short to contain them.
	 * This also covers a zero length.
	 */
	if (tl < offsetof(struct nvmf_auth_dhchap_success2_data, t_id) +
		 sizeof(data->t_id)) {
		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		req->error_loc =
			offsetof(struct nvmf_auth_send_command, tl);
		goto done;
	}
	if (!nvmet_check_transfer_len(req, tl)) {
		pr_debug("%s: transfer length mismatch (%u)\n", __func__, tl);
		return;
	}

	d = kmalloc(tl, GFP_KERNEL);
	if (!d) {
		status = NVME_SC_INTERNAL;
		goto done;
	}

	status = nvmet_copy_from_sgl(req, 0, d, tl);
	if (status) {
		kfree(d);
		goto done;
	}

	data = d;
	pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__,
		 ctrl->cntlid, req->sq->qid, data->auth_type, data->auth_id,
		 req->sq->dhchap_step);
	if (data->auth_type != NVME_AUTH_COMMON_MESSAGES &&
	    data->auth_type != NVME_AUTH_DHCHAP_MESSAGES)
		goto done_failure1;
	if (data->auth_type == NVME_AUTH_COMMON_MESSAGES) {
		if (data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE) {
			/* Restart negotiation */
			pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__,
				 ctrl->cntlid, req->sq->qid);
			if (!req->sq->qid) {
				if (nvmet_setup_auth(ctrl) < 0) {
					status = NVME_SC_INTERNAL;
					pr_err("ctrl %d qid 0 failed to setup re-authentication\n",
					       ctrl->cntlid);
					goto done_failure1;
				}
			}
			req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
		} else if (data->auth_id != req->sq->dhchap_step)
			goto done_failure1;
		/* Validate negotiation parameters */
		status = nvmet_auth_negotiate(req, d);
		if (status == 0)
			req->sq->dhchap_step =
				NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE;
		else {
			/* Report the reason via the next Receive, not here. */
			req->sq->dhchap_step =
				NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
			req->sq->dhchap_status = status;
			status = 0;
		}
		goto done_kfree;
	}
	if (data->auth_id != req->sq->dhchap_step) {
		pr_debug("%s: ctrl %d qid %d step mismatch (%d != %d)\n",
			 __func__, ctrl->cntlid, req->sq->qid,
			 data->auth_id, req->sq->dhchap_step);
		goto done_failure1;
	}
	if (le16_to_cpu(data->t_id) != req->sq->dhchap_tid) {
		pr_debug("%s: ctrl %d qid %d invalid transaction %d (expected %d)\n",
			 __func__, ctrl->cntlid, req->sq->qid,
			 le16_to_cpu(data->t_id),
			 req->sq->dhchap_tid);
		req->sq->dhchap_step =
			NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
		req->sq->dhchap_status =
			NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
		goto done_kfree;
	}

	switch (data->auth_id) {
	case NVME_AUTH_DHCHAP_MESSAGE_REPLY:
		status = nvmet_auth_reply(req, d);
		if (status == 0)
			req->sq->dhchap_step =
				NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1;
		else {
			req->sq->dhchap_step =
				NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
			req->sq->dhchap_status = status;
			status = 0;
		}
		goto done_kfree;
	case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2:
		req->sq->authenticated = true;
		pr_debug("%s: ctrl %d qid %d ctrl authenticated\n",
			 __func__, ctrl->cntlid, req->sq->qid);
		goto done_kfree;
	case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2:
		status = nvmet_auth_failure2(req, d);
		if (status) {
			pr_warn("ctrl %d qid %d: authentication failed (%d)\n",
				ctrl->cntlid, req->sq->qid, status);
			req->sq->dhchap_status = status;
			req->sq->authenticated = false;
			status = 0;
		}
		goto done_kfree;
	default:
		req->sq->dhchap_status =
			NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
		req->sq->dhchap_step =
			NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
		req->sq->authenticated = false;
		goto done_kfree;
	}
done_failure1:
	req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
	req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;

done_kfree:
	kfree(d);
done:
	pr_debug("%s: ctrl %d qid %d dhchap status %x step %x\n", __func__,
		 ctrl->cntlid, req->sq->qid,
		 req->sq->dhchap_status, req->sq->dhchap_step);
	if (status)
		pr_debug("%s: ctrl %d qid %d nvme status %x error loc %d\n",
			 __func__, ctrl->cntlid, req->sq->qid,
			 status, req->error_loc);
	req->cqe->result.u64 = 0;
	if (req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 &&
	    req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) {
		/* Transaction still in flight: (re)arm the expiry timer. */
		unsigned long auth_expire_secs = ctrl->kato ? ctrl->kato : 120;

		mod_delayed_work(system_wq, &req->sq->auth_expired_work,
				 auth_expire_secs * HZ);
		goto complete;
	}
	/* Final states, clear up variables */
	nvmet_auth_sq_free(req->sq);
	if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2)
		nvmet_ctrl_fatal_error(ctrl);

complete:
	/* Complete last: req and req->sq must not be touched afterwards. */
	nvmet_req_complete(req, status);
}
static int nvmet_auth_challenge(struct nvmet_req *req, void *d, int al)
{
struct nvmf_auth_dhchap_challenge_data *data = d;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
int ret = 0;
int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id);
int data_size = sizeof(*d) + hash_len;
if (ctrl->dh_tfm)
data_size += ctrl->dh_keysize;
if (al < data_size) {
pr_debug("%s: buffer too small (al %d need %d)\n", __func__,
al, data_size);
return -EINVAL;
}
memset(data, 0, data_size);
req->sq->dhchap_s1 = nvme_auth_get_seqnum();
data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE;
data->t_id = cpu_to_le16(req->sq->dhchap_tid);
data->hashid = ctrl->shash_id;
data->hl = hash_len;
data->seqnum = cpu_to_le32(req->sq->dhchap_s1);
req->sq->dhchap_c1 = kmalloc(data->hl, GFP_KERNEL);
if (!req->sq->dhchap_c1)
return -ENOMEM;
get_random_bytes(req->sq->dhchap_c1, data->hl);
memcpy(data->cval, req->sq->dhchap_c1, data->hl);
if (ctrl->dh_tfm) {
data->dhgid = ctrl->dh_gid;
data->dhvlen = cpu_to_le16(ctrl->dh_keysize);
ret = nvmet_auth_ctrl_exponential(req, data->cval + data->hl,
ctrl->dh_keysize);
}
pr_debug("%s: ctrl %d qid %d seq %d transaction %d hl %d dhvlen %zu\n",
__func__, ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
req->sq->dhchap_tid, data->hl, ctrl->dh_keysize);
return ret;
}
static int nvmet_auth_success1(struct nvmet_req *req, void *d, int al)
{
struct nvmf_auth_dhchap_success1_data *data = d;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id);
WARN_ON(al < sizeof(*data));
memset(data, 0, sizeof(*data));
data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1;
data->t_id = cpu_to_le16(req->sq->dhchap_tid);
data->hl = hash_len;
if (req->sq->dhchap_c2) {
if (!ctrl->ctrl_key) {
pr_warn("ctrl %d qid %d no ctrl key\n",
ctrl->cntlid, req->sq->qid);
return NVME_AUTH_DHCHAP_FAILURE_FAILED;
}
if (nvmet_auth_ctrl_hash(req, data->rval, data->hl))
return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
data->rvalid = 1;
pr_debug("ctrl %d qid %d response %*ph\n",
ctrl->cntlid, req->sq->qid, data->hl, data->rval);
}
return 0;
}
/*
 * Build a DH-HMAC-CHAP FAILURE1 message in @d, carrying the reason
 * stored in sq->dhchap_status.
 *
 * @req: the Authentication Receive request
 * @d:   output buffer
 * @al:  size of @d in bytes
 *
 * Nothing is written when @al is too small for the message.
 */
static void nvmet_auth_failure1(struct nvmet_req *req, void *d, int al)
{
	struct nvmf_auth_dhchap_failure_data *data = d;

	if (WARN_ON(al < sizeof(*data)))
		return;

	/* Start from zero so fields not set below are not left stale. */
	memset(data, 0, sizeof(*data));
	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
	data->t_id = cpu_to_le16(req->sq->dhchap_tid);
	data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED;
	data->rescode_exp = req->sq->dhchap_status;
}
void nvmet_execute_auth_receive(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
void *d;
u32 al;
u16 status = 0;
if (req->cmd->auth_receive.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
req->error_loc =
offsetof(struct nvmf_auth_receive_command, secp);
goto done;
}
if (req->cmd->auth_receive.spsp0 != 0x01) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
req->error_loc =
offsetof(struct nvmf_auth_receive_command, spsp0);
goto done;
}
if (req->cmd->auth_receive.spsp1 != 0x01) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
req->error_loc =
offsetof(struct nvmf_auth_receive_command, spsp1);
goto done;
}
al = le32_to_cpu(req->cmd->auth_receive.al);
if (!al) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
req->error_loc =
offsetof(struct nvmf_auth_receive_command, al);
goto done;
}
if (!nvmet_check_transfer_len(req, al)) {
pr_debug("%s: transfer length mismatch (%u)\n", __func__, al);
return;
}
d = kmalloc(al, GFP_KERNEL);
if (!d) {
status = NVME_SC_INTERNAL;
goto done;
}
pr_debug("%s: ctrl %d qid %d step %x\n", __func__,
ctrl->cntlid, req->sq->qid, req->sq->dhchap_step);
switch (req->sq->dhchap_step) {
case NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE:
if (nvmet_auth_challenge(req, d, al) < 0) {
pr_warn("ctrl %d qid %d: challenge error (%d)\n",
ctrl->cntlid, req->sq->qid, status);
status = NVME_SC_INTERNAL;
break;
}
if (status) {
req->sq->dhchap_status = status;
nvmet_auth_failure1(req, d, al);
pr_warn("ctrl %d qid %d: challenge status (%x)\n",
ctrl->cntlid, req->sq->qid,
req->sq->dhchap_status);
status = 0;
break;
}
req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_REPLY;
break;
case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1:
status = nvmet_auth_success1(req, d, al);
if (status) {
req->sq->dhchap_status = status;
req->sq->authenticated = false;
nvmet_auth_failure1(req, d, al);
pr_warn("ctrl %d qid %d: success1 status (%x)\n",
ctrl->cntlid, req->sq->qid,
req->sq->dhchap_status);
break;
}
req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2;
break;
case NVME_AUTH_DHCHAP_MESSAGE_FAILURE1:
req->sq->authenticated = false;
nvmet_auth_failure1(req, d, al);
pr_warn("ctrl %d qid %d failure1 (%x)\n",
ctrl->cntlid, req->sq->qid, req->sq->dhchap_status);
break;
default:
pr_warn("ctrl %d qid %d unhandled step (%d)\n",
ctrl->cntlid, req->sq->qid, req->sq->dhchap_step);
req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1;
req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
nvmet_auth_failure1(req, d, al);
status = 0;
break;
}
status = nvmet_copy_to_sgl(req, 0, d, al);
kfree(d);
done:
req->cqe->result.u64 = 0;
nvmet_req_complete(req, status);
if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2)
nvmet_auth_sq_free(req->sq);
else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
nvmet_auth_sq_free(req->sq);
nvmet_ctrl_fatal_error(ctrl);
}
}

View File

@ -82,7 +82,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
@ -93,6 +93,37 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
case nvme_fabrics_type_property_get:
req->execute = nvmet_execute_prop_get;
break;
#ifdef CONFIG_NVME_TARGET_AUTH
case nvme_fabrics_type_auth_send:
req->execute = nvmet_execute_auth_send;
break;
case nvme_fabrics_type_auth_receive:
req->execute = nvmet_execute_auth_receive;
break;
#endif
default:
pr_debug("received unknown capsule type 0x%x\n",
cmd->fabrics.fctype);
req->error_loc = offsetof(struct nvmf_common_command, fctype);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
return 0;
}
u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
switch (cmd->fabrics.fctype) {
#ifdef CONFIG_NVME_TARGET_AUTH
case nvme_fabrics_type_auth_send:
req->execute = nvmet_execute_auth_send;
break;
case nvme_fabrics_type_auth_receive:
req->execute = nvmet_execute_auth_receive;
break;
#endif
default:
pr_debug("received unknown capsule type 0x%x\n",
cmd->fabrics.fctype);
@ -173,6 +204,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
struct nvmf_connect_data *d;
struct nvmet_ctrl *ctrl = NULL;
u16 status = 0;
int ret;
if (!nvmet_check_transfer_len(req, sizeof(struct nvmf_connect_data)))
return;
@ -215,18 +247,32 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
uuid_copy(&ctrl->hostid, &d->hostid);
ret = nvmet_setup_auth(ctrl);
if (ret < 0) {
pr_err("Failed to setup authentication, error %d\n", ret);
nvmet_ctrl_put(ctrl);
if (ret == -EPERM)
status = (NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR);
else
status = NVME_SC_INTERNAL;
goto out;
}
status = nvmet_install_queue(ctrl, req);
if (status) {
nvmet_ctrl_put(ctrl);
goto out;
}
pr_info("creating %s controller %d for subsystem %s for NQN %s%s.\n",
pr_info("creating %s controller %d for subsystem %s for NQN %s%s%s.\n",
nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm",
ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn,
ctrl->pi_support ? " T10-PI is enabled" : "");
ctrl->pi_support ? " T10-PI is enabled" : "",
nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : "");
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
if (nvmet_has_auth(ctrl))
nvmet_init_auth(ctrl, req);
out:
kfree(d);
complete:
@ -286,6 +332,9 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
if (nvmet_has_auth(ctrl))
nvmet_init_auth(ctrl, req);
out:
kfree(d);

View File

@ -424,9 +424,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
{
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_cancel_request, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
nvme_cancel_tagset(&ctrl->ctrl);
nvme_loop_destroy_io_queues(ctrl);
}
@ -434,9 +432,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
nvme_shutdown_ctrl(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
nvme_cancel_admin_tagset(&ctrl->ctrl);
nvme_loop_destroy_admin_queue(ctrl);
}

View File

@ -108,6 +108,19 @@ struct nvmet_sq {
u16 size;
u32 sqhd;
bool sqhd_disabled;
#ifdef CONFIG_NVME_TARGET_AUTH
struct delayed_work auth_expired_work;
bool authenticated;
u16 dhchap_tid;
u16 dhchap_status;
int dhchap_step;
u8 *dhchap_c1;
u8 *dhchap_c2;
u32 dhchap_s1;
u32 dhchap_s2;
u8 *dhchap_skey;
int dhchap_skey_len;
#endif
struct completion free_done;
struct completion confirm_done;
};
@ -209,6 +222,15 @@ struct nvmet_ctrl {
u64 err_counter;
struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS];
bool pi_support;
#ifdef CONFIG_NVME_TARGET_AUTH
struct nvme_dhchap_key *host_key;
struct nvme_dhchap_key *ctrl_key;
u8 shash_id;
struct crypto_kpp *dh_tfm;
u8 dh_gid;
u8 *dh_key;
size_t dh_keysize;
#endif
};
struct nvmet_subsys {
@ -271,6 +293,12 @@ static inline struct nvmet_subsys *namespaces_to_subsys(
/*
 * Per-host configfs object, including the DH-HMAC-CHAP settings edited
 * through the dhchap_* attributes.
 */
struct nvmet_host {
struct config_group group;
/* host secret; printed by the dhchap_key attribute */
u8 *dhchap_secret;
/* controller secret; printed by the dhchap_ctrl_key attribute */
u8 *dhchap_ctrl_secret;
/* NOTE(review): presumably the hash ids associated with each secret - not used in the code visible here */
u8 dhchap_key_hash;
u8 dhchap_ctrl_key_hash;
/* NVME_AUTH_HASH_* id selected via the dhchap_hash attribute */
u8 dhchap_hash_id;
/* NVME_AUTH_DHGROUP_* id selected via the dhchap_dhgroup attribute */
u8 dhchap_dhgroup_id;
};
static inline struct nvmet_host *to_host(struct config_item *item)
@ -420,7 +448,8 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req);
u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops);
@ -668,4 +697,48 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio)
bio_put(bio);
}
#ifdef CONFIG_NVME_TARGET_AUTH
void nvmet_execute_auth_send(struct nvmet_req *req);
void nvmet_execute_auth_receive(struct nvmet_req *req);
int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
bool set_ctrl);
int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash);
int nvmet_setup_auth(struct nvmet_ctrl *ctrl);
void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req);
void nvmet_destroy_auth(struct nvmet_ctrl *ctrl);
void nvmet_auth_sq_free(struct nvmet_sq *sq);
int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id);
bool nvmet_check_auth_status(struct nvmet_req *req);
int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
unsigned int hash_len);
int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
unsigned int hash_len);
/* True when a host key is attached to the controller. */
static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
{
	return !!ctrl->host_key;
}
int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
u8 *buf, int buf_size);
int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
u8 *buf, int buf_size);
#else
/*
 * Stubs used when CONFIG_NVME_TARGET_AUTH is disabled, so callers need
 * no #ifdefs: setup succeeds, init/destroy/free do nothing, every
 * request counts as authenticated and no controller has authentication.
 */
static inline int nvmet_setup_auth(struct nvmet_ctrl *ctrl)
{
return 0;
}
static inline void nvmet_init_auth(struct nvmet_ctrl *ctrl,
struct nvmet_req *req) {};
static inline void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) {};
static inline void nvmet_auth_sq_free(struct nvmet_sq *sq) {};
/* Without authentication support no command is ever rejected for it. */
static inline bool nvmet_check_auth_status(struct nvmet_req *req)
{
return true;
}
static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
{
return false;
}
static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; }
#endif
#endif /* _NVMET_H */

View File

@ -1839,7 +1839,8 @@ static int __init nvmet_tcp_init(void)
{
int ret;
nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
if (!nvmet_tcp_wq)
return -ENOMEM;

View File

@ -1725,7 +1725,7 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm,
dasd_put_device(device);
}
/* check for for attention message */
/* check for attention message */
if (scsw_dstat(&irb->scsw) & DEV_STAT_ATTENTION) {
device = dasd_device_from_cdev_locked(cdev);
if (!IS_ERR(device)) {

View File

@ -639,6 +639,7 @@ static void dasd_diag_setup_blk_queue(struct dasd_block *block)
/* With page sized segments each segment can be translated into one idaw/tidaw */
blk_queue_max_segment_size(q, PAGE_SIZE);
blk_queue_segment_boundary(q, PAGE_SIZE - 1);
blk_queue_dma_alignment(q, PAGE_SIZE - 1);
}
static int dasd_diag_pe_handler(struct dasd_device *device,

View File

@ -6626,6 +6626,7 @@ static void dasd_eckd_setup_blk_queue(struct dasd_block *block)
/* With page sized segments each segment can be translated into one idaw/tidaw */
blk_queue_max_segment_size(q, PAGE_SIZE);
blk_queue_segment_boundary(q, PAGE_SIZE - 1);
blk_queue_dma_alignment(q, PAGE_SIZE - 1);
}
static struct ccw_driver dasd_eckd_driver = {

View File

@ -863,7 +863,7 @@ dcssblk_submit_bio(struct bio *bio)
unsigned long source_addr;
unsigned long bytes_done;
blk_queue_split(&bio);
bio = bio_split_to_limits(bio);
bytes_done = 0;
dev_info = bio->bi_bdev->bd_disk->private_data;

View File

@ -718,6 +718,8 @@ static inline void ahash_request_set_crypt(struct ahash_request *req,
struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
u32 mask);
int crypto_has_shash(const char *alg_name, u32 type, u32 mask);
static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
{
return &tfm->base;

View File

@ -104,6 +104,8 @@ struct kpp_alg {
*/
struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask);
int crypto_has_kpp(const char *alg_name, u32 type, u32 mask);
static inline struct crypto_tfm *crypto_kpp_tfm(struct crypto_kpp *tfm)
{
return &tfm->base;

16
include/linux/base64.h Normal file
View File

@ -0,0 +1,16 @@
// SPDX-License-Identifier: GPL-2.0
/*
* base64 encoding, lifted from fs/crypto/fname.c.
*/
#ifndef _LINUX_BASE64_H
#define _LINUX_BASE64_H
#include <linux/types.h>
/* Evaluates to nbytes * 4 / 3, rounded up. */
#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
/* Encode @len bytes from @src into @dst. NOTE(review): return value and termination are defined by the implementation - confirm there. */
int base64_encode(const u8 *src, int len, char *dst);
/* Decode @len characters from @src into @dst. NOTE(review): return value on invalid input is defined by the implementation - confirm there. */
int base64_decode(const char *src, int len, u8 *dst);
#endif /* _LINUX_BASE64_H */

View File

@ -140,6 +140,8 @@ struct gendisk {
struct request_queue *queue;
void *private_data;
struct bio_set bio_split;
int flags;
unsigned long state;
#define GD_NEED_PART_SCAN 0
@ -531,7 +533,6 @@ struct request_queue {
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
struct bio_set bio_split;
struct dentry *debugfs_dir;
struct dentry *sched_debugfs_dir;
@ -864,9 +865,9 @@ void blk_request_module(dev_t devt);
extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
void submit_bio_noacct(struct bio *bio);
struct bio *bio_split_to_limits(struct bio *bio);
extern int blk_lld_busy(struct request_queue *q);
extern void blk_queue_split(struct bio **);
extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
extern void blk_queue_exit(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);

41
include/linux/nvme-auth.h Normal file
View File

@ -0,0 +1,41 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2021 Hannes Reinecke, SUSE Software Solutions
*/
#ifndef _NVME_AUTH_H
#define _NVME_AUTH_H
#include <crypto/kpp.h>
/*
 * A DH-HMAC-CHAP key as handled by the nvme_auth_* helpers declared
 * below.
 */
struct nvme_dhchap_key {
/* key bytes; presumably @len of them - TODO confirm */
u8 *key;
size_t len;
/* NOTE(review): looks like an NVME_AUTH_HASH_* id tied to the key - confirm */
u8 hash;
};
u32 nvme_auth_get_seqnum(void);
const char *nvme_auth_dhgroup_name(u8 dhgroup_id);
const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id);
u8 nvme_auth_dhgroup_id(const char *dhgroup_name);
const char *nvme_auth_hmac_name(u8 hmac_id);
const char *nvme_auth_digest_name(u8 hmac_id);
size_t nvme_auth_hmac_hash_len(u8 hmac_id);
u8 nvme_auth_hmac_id(const char *hmac_name);
struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
u8 key_hash);
void nvme_auth_free_key(struct nvme_dhchap_key *key);
u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
u8 *challenge, u8 *aug, size_t hlen);
int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid);
int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
u8 *host_key, size_t host_key_len);
int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
u8 *ctrl_key, size_t ctrl_key_len,
u8 *sess_key, size_t sess_key_len);
#endif /* _NVME_AUTH_H */

View File

@ -19,6 +19,7 @@
#define NVMF_TRSVCID_SIZE 32
#define NVMF_TRADDR_SIZE 256
#define NVMF_TSAS_SIZE 256
#define NVMF_AUTH_HASH_LEN 64
#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery"
@ -711,6 +712,10 @@ enum {
NVME_AER_VS = 7,
};
enum {
NVME_AER_ERROR_PERSIST_INT_ERR = 0x03,
};
enum {
NVME_AER_NOTICE_NS_CHANGED = 0x00,
NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
@ -1369,6 +1374,8 @@ enum nvmf_capsule_command {
nvme_fabrics_type_property_set = 0x00,
nvme_fabrics_type_connect = 0x01,
nvme_fabrics_type_property_get = 0x04,
nvme_fabrics_type_auth_send = 0x05,
nvme_fabrics_type_auth_receive = 0x06,
};
#define nvme_fabrics_type_name(type) { type, #type }
@ -1376,7 +1383,9 @@ enum nvmf_capsule_command {
__print_symbolic(type, \
nvme_fabrics_type_name(nvme_fabrics_type_property_set), \
nvme_fabrics_type_name(nvme_fabrics_type_connect), \
nvme_fabrics_type_name(nvme_fabrics_type_property_get))
nvme_fabrics_type_name(nvme_fabrics_type_property_get), \
nvme_fabrics_type_name(nvme_fabrics_type_auth_send), \
nvme_fabrics_type_name(nvme_fabrics_type_auth_receive))
/*
* If not fabrics command, fctype will be ignored.
@ -1472,6 +1481,11 @@ struct nvmf_connect_command {
__u8 resv4[12];
};
enum {
NVME_CONNECT_AUTHREQ_ASCR = (1 << 2),
NVME_CONNECT_AUTHREQ_ATR = (1 << 1),
};
struct nvmf_connect_data {
uuid_t hostid;
__le16 cntlid;
@ -1506,6 +1520,200 @@ struct nvmf_property_get_command {
__u8 resv4[16];
};
/*
 * Field layout shared by the Authentication Send and Receive commands;
 * 'al_tl' occupies the position of 'tl' (send) and 'al' (receive).
 */
struct nvmf_auth_common_command {
__u8 opcode;
__u8 resv1;
__u16 command_id;
__u8 fctype;
__u8 resv2[19];
union nvme_data_ptr dptr;
__u8 resv3;
__u8 spsp0;
__u8 spsp1;
__u8 secp;
__le32 al_tl;
__u8 resv4[16];
};
/*
 * Fabrics Authentication Send command
 * (fctype nvme_fabrics_type_auth_send).
 */
struct nvmf_auth_send_command {
__u8 opcode;
__u8 resv1;
__u16 command_id;
__u8 fctype;
__u8 resv2[19];
union nvme_data_ptr dptr;
__u8 resv3;
/* the target requires spsp0 and spsp1 to be 0x01 */
__u8 spsp0;
__u8 spsp1;
/* security protocol; NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER for DH-HMAC-CHAP */
__u8 secp;
/* transfer length: number of payload bytes sent */
__le32 tl;
__u8 resv4[16];
};
/*
 * Fabrics Authentication Receive command
 * (fctype nvme_fabrics_type_auth_receive).
 */
struct nvmf_auth_receive_command {
__u8 opcode;
__u8 resv1;
__u16 command_id;
__u8 fctype;
__u8 resv2[19];
union nvme_data_ptr dptr;
__u8 resv3;
/* the target requires spsp0 and spsp1 to be 0x01 */
__u8 spsp0;
__u8 spsp1;
/* security protocol; NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER for DH-HMAC-CHAP */
__u8 secp;
/* allocation length: size of the buffer the reply is copied into */
__le32 al;
__u8 resv4[16];
};
/* Value for secp */
enum {
NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER = 0xe9,
};
/* Defined value for auth_type */
enum {
NVME_AUTH_COMMON_MESSAGES = 0x00,
NVME_AUTH_DHCHAP_MESSAGES = 0x01,
};
/*
 * Defined messages for 'auth_id', the second byte of every
 * nvmf_auth_dhchap_*_data payload.
 *
 * Note that the two failure codes are not in name order: FAILURE2 is 0xf0
 * and FAILURE1 is 0xf1.
 */
enum {
NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE = 0x00,
NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE = 0x01,
NVME_AUTH_DHCHAP_MESSAGE_REPLY = 0x02,
NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1 = 0x03,
NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 = 0x04,
NVME_AUTH_DHCHAP_MESSAGE_FAILURE2 = 0xf0,
NVME_AUTH_DHCHAP_MESSAGE_FAILURE1 = 0xf1,
};
/*
 * DH-HMAC-CHAP protocol descriptor: four single-byte header fields followed
 * by a 60-byte identifier list.
 */
struct nvmf_auth_dhchap_protocol_descriptor {
/* presumably NVME_AUTH_DHCHAP_AUTH_ID - confirm against the users */
__u8 authid;
__u8 rsvd;
/*
 * NOTE(review): halen/dhlen presumably count the hash and DH group
 * identifiers stored in idlist - confirm against the users.
 */
__u8 halen;
__u8 dhlen;
__u8 idlist[60];
};
/*
 * Identifier of the DH-HMAC-CHAP protocol.
 * NOTE(review): presumably the value stored in the 'authid' field of
 * struct nvmf_auth_dhchap_protocol_descriptor - confirm.
 */
enum {
NVME_AUTH_DHCHAP_AUTH_ID = 0x01,
};
/*
 * Defined hash functions for DH-HMAC-CHAP authentication.
 * 0x01-0x03 name SHA-256/384/512; 0xff is the value named "invalid".
 */
enum {
NVME_AUTH_HASH_SHA256 = 0x01,
NVME_AUTH_HASH_SHA384 = 0x02,
NVME_AUTH_HASH_SHA512 = 0x03,
NVME_AUTH_HASH_INVALID = 0xff,
};
/*
 * Defined Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication.
 * 0x00 is the NULL group, 0x01-0x05 are named after sizes from 2048 to 8192,
 * and 0xff is the value named "invalid".
 */
enum {
NVME_AUTH_DHGROUP_NULL = 0x00,
NVME_AUTH_DHGROUP_2048 = 0x01,
NVME_AUTH_DHGROUP_3072 = 0x02,
NVME_AUTH_DHGROUP_4096 = 0x03,
NVME_AUTH_DHGROUP_6144 = 0x04,
NVME_AUTH_DHGROUP_8192 = 0x05,
NVME_AUTH_DHGROUP_INVALID = 0xff,
};
/*
 * One authentication protocol descriptor. DH-HMAC-CHAP is the only member
 * defined here.
 */
union nvmf_auth_protocol {
struct nvmf_auth_dhchap_protocol_descriptor dhchap;
};
/*
 * Payload of the negotiate message: an 8-byte header followed by a variable
 * number of protocol descriptors.
 */
struct nvmf_auth_dhchap_negotiate_data {
/* see the NVME_AUTH_*_MESSAGES values */
__u8 auth_type;
/* see the NVME_AUTH_DHCHAP_MESSAGE_* values */
__u8 auth_id;
__le16 rsvd;
/* little-endian; presumably the transaction identifier - confirm */
__le16 t_id;
/* NOTE(review): meaning of sc_c is not shown here */
__u8 sc_c;
/* presumably the number of entries in auth_protocol[] - confirm */
__u8 napd;
union nvmf_auth_protocol auth_protocol[];
};
/*
 * Payload of the challenge message: a 16-byte header followed by
 * variable-length data.
 */
struct nvmf_auth_dhchap_challenge_data {
	/* see the NVME_AUTH_*_MESSAGES values */
	__u8 auth_type;
	/* see the NVME_AUTH_DHCHAP_MESSAGE_* values */
	__u8 auth_id;
	/* annotated __le16 like the reserved field of the sibling payloads */
	__le16 rsvd1;
	/* little-endian; presumably the transaction identifier - confirm */
	__le16 t_id;
	/* length in bytes of the challenge value in cval[] */
	__u8 hl;
	__u8 rsvd2;
	/* presumably one of the NVME_AUTH_HASH_* values - confirm */
	__u8 hashid;
	/* presumably one of the NVME_AUTH_DHGROUP_* values - confirm */
	__u8 dhgid;
	/* length in bytes of the DH value that follows the challenge value */
	__le16 dhvlen;
	__le32 seqnum;
	/* 'hl' bytes of challenge value */
	__u8 cval[];
	/* followed by 'dhvlen' bytes of DH value */
};
/*
 * Payload of the reply message: a 16-byte header followed by
 * variable-length data (response, then challenge, then DH value).
 */
struct nvmf_auth_dhchap_reply_data {
/* see the NVME_AUTH_*_MESSAGES values */
__u8 auth_type;
/* see the NVME_AUTH_DHCHAP_MESSAGE_* values */
__u8 auth_id;
__le16 rsvd1;
/* little-endian; presumably the transaction identifier - confirm */
__le16 t_id;
/* length in bytes of the response and of the challenge that follow */
__u8 hl;
__u8 rsvd2;
/* NOTE(review): presumably flags whether a challenge value is included - confirm */
__u8 cvalid;
__u8 rsvd3;
/* length in bytes of the trailing DH value */
__le16 dhvlen;
__le32 seqnum;
/* 'hl' bytes of response data */
__u8 rval[];
/* followed by 'hl' bytes of Challenge value */
/* followed by 'dhvlen' bytes of DH value */
};
/*
 * Bit 0 flag named "response valid".
 * NOTE(review): presumably the bit tested in the 'rvalid' field of
 * struct nvmf_auth_dhchap_success1_data - confirm against the users.
 */
enum {
NVME_AUTH_DHCHAP_RESPONSE_VALID = (1 << 0),
};
/*
 * Payload of the success1 message: a 16-byte header optionally followed by
 * a response value.
 */
struct nvmf_auth_dhchap_success1_data {
/* see the NVME_AUTH_*_MESSAGES values */
__u8 auth_type;
/* see the NVME_AUTH_DHCHAP_MESSAGE_* values */
__u8 auth_id;
__le16 rsvd1;
/* little-endian; presumably the transaction identifier - confirm */
__le16 t_id;
/* length in bytes of the response value in rval[] */
__u8 hl;
__u8 rsvd2;
/* tells whether rval[] carries a response value */
__u8 rvalid;
__u8 rsvd3[7];
/* 'hl' bytes of response value if 'rvalid' is set */
__u8 rval[];
};
/*
 * Payload of the success2 message: the common type/id/transaction header
 * followed only by reserved bytes.
 */
struct nvmf_auth_dhchap_success2_data {
/* see the NVME_AUTH_*_MESSAGES values */
__u8 auth_type;
/* see the NVME_AUTH_DHCHAP_MESSAGE_* values */
__u8 auth_id;
__le16 rsvd1;
/* little-endian; presumably the transaction identifier - confirm */
__le16 t_id;
__u8 rsvd2[10];
};
/*
 * Payload of the failure messages: the common type/id/transaction header
 * followed by a reason code and its explanation.
 */
struct nvmf_auth_dhchap_failure_data {
/* see the NVME_AUTH_*_MESSAGES values */
__u8 auth_type;
/* see the NVME_AUTH_DHCHAP_MESSAGE_* values */
__u8 auth_id;
__le16 rsvd1;
/* little-endian; presumably the transaction identifier - confirm */
__le16 t_id;
/* presumably NVME_AUTH_DHCHAP_FAILURE_REASON_* - confirm */
__u8 rescode;
/* presumably one of the other NVME_AUTH_DHCHAP_FAILURE_* values - confirm */
__u8 rescode_exp;
};
/*
 * Failure reason code; only "failed" (0x01) is defined here.
 * NOTE(review): presumably the value for the 'rescode' field of
 * struct nvmf_auth_dhchap_failure_data - confirm.
 */
enum {
NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED = 0x01,
};
/*
 * Detailed failure codes, numbered consecutively from 0x01 to 0x07.
 * NOTE(review): presumably the values for the 'rescode_exp' field of
 * struct nvmf_auth_dhchap_failure_data - confirm.
 */
enum {
NVME_AUTH_DHCHAP_FAILURE_FAILED = 0x01,
NVME_AUTH_DHCHAP_FAILURE_NOT_USABLE = 0x02,
NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH = 0x03,
NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE = 0x04,
NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE = 0x05,
NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD = 0x06,
NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE = 0x07,
};
struct nvme_dbbuf {
__u8 opcode;
__u8 flags;
@ -1549,6 +1757,9 @@ struct nvme_command {
struct nvmf_connect_command connect;
struct nvmf_property_set_command prop_set;
struct nvmf_property_get_command prop_get;
struct nvmf_auth_common_command auth_common;
struct nvmf_auth_send_command auth_send;
struct nvmf_auth_receive_command auth_receive;
struct nvme_dbbuf dbbuf;
struct nvme_directive_cmd directive;
};

View File

@ -15,6 +15,8 @@
#define UBLK_CMD_DEL_DEV 0x05
#define UBLK_CMD_START_DEV 0x06
#define UBLK_CMD_STOP_DEV 0x07
#define UBLK_CMD_SET_PARAMS 0x08
#define UBLK_CMD_GET_PARAMS 0x09
/*
* IO commands, issued by ublk server, and handled by ublk driver.
@ -28,12 +30,21 @@
* this IO request, request's handling result is committed to ublk
* driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be
* handled before completing io request.
*
* NEED_GET_DATA: only used for write requests to set io addr and copy data
* When NEED_GET_DATA is set, ublksrv has to issue UBLK_IO_NEED_GET_DATA
* command after ublk driver returns UBLK_IO_RES_NEED_GET_DATA.
*
* It is only used if ublksrv set UBLK_F_NEED_GET_DATA flag
* while starting a ublk device.
*/
#define UBLK_IO_FETCH_REQ 0x20
#define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21
#define UBLK_IO_NEED_GET_DATA 0x22
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
#define UBLK_IO_RES_NEED_GET_DATA 1
#define UBLK_IO_RES_ABORT (-ENODEV)
#define UBLKSRV_CMD_BUF_OFFSET 0
@ -54,6 +65,15 @@
*/
#define UBLK_F_URING_CMD_COMP_IN_TASK (1ULL << 1)
/*
 * User should issue io cmd again for write requests to
 * set io buffer address and copy data from bio vectors
 * to the userspace io buffer.
 *
 * In this mode, task_work is not used.
 *
 * Built from 1ULL, like UBLK_F_URING_CMD_COMP_IN_TASK, so that the constant
 * is 64 bits wide on every ABI and matches the __u64 'flags' word of
 * struct ublksrv_ctrl_dev_info.
 */
#define UBLK_F_NEED_GET_DATA (1ULL << 2)
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
@ -78,22 +98,23 @@ struct ublksrv_ctrl_cmd {
struct ublksrv_ctrl_dev_info {
__u16 nr_hw_queues;
__u16 queue_depth;
__u16 block_size;
__u16 state;
__u16 pad0;
__u32 rq_max_blocks;
__u32 max_io_buf_bytes;
__u32 dev_id;
__u64 dev_blocks;
__s32 ublksrv_pid;
__s32 reserved0;
__u32 pad1;
__u64 flags;
__u64 flags_reserved;
/* For ublksrv internal use, invisible to ublk driver */
__u64 ublksrv_flags;
__u64 reserved1[9];
__u64 reserved0;
__u64 reserved1;
__u64 reserved2;
};
#define UBLK_IO_OP_READ 0
@ -158,4 +179,49 @@ struct ublksrv_io_cmd {
__u64 addr;
};
/*
 * Basic parameters of a ublk device, embedded as 'basic' in
 * struct ublk_params.
 */
struct ublk_param_basic {
/* bit flags for 'attrs' */
#define UBLK_ATTR_READ_ONLY (1 << 0)
#define UBLK_ATTR_ROTATIONAL (1 << 1)
#define UBLK_ATTR_VOLATILE_CACHE (1 << 2)
#define UBLK_ATTR_FUA (1 << 3)
__u32 attrs;
/*
 * NOTE(review): the four *_shift fields presumably hold log2 of the
 * corresponding size in bytes - confirm against the driver.
 */
__u8 logical_bs_shift;
__u8 physical_bs_shift;
__u8 io_opt_shift;
__u8 io_min_shift;
__u32 max_sectors;
__u32 chunk_sectors;
/* device capacity, counted in sectors going by the name */
__u64 dev_sectors;
__u64 virt_boundary_mask;
};
/*
 * Discard and write-zeroes parameters of a ublk device, embedded as
 * 'discard' in struct ublk_params.
 */
struct ublk_param_discard {
__u32 discard_alignment;
__u32 discard_granularity;
__u32 max_discard_sectors;
__u32 max_write_zeroes_sectors;
__u16 max_discard_segments;
/* brings the struct to 20 bytes */
__u16 reserved0;
};
/*
 * Parameter container exchanged by the SET_PARAMS and GET_PARAMS commands.
 */
struct ublk_params {
/*
 * Total length of the parameters. Userspace has to set 'len' for both
 * the SET_PARAMS and the GET_PARAMS command, and the driver may update
 * it if the two sides use different versions of 'ublk_params'. The same
 * applies to the 'types' field.
 */
__u32 len;
/* bit flags for 'types' */
#define UBLK_PARAM_TYPE_BASIC (1 << 0)
#define UBLK_PARAM_TYPE_DISCARD (1 << 1)
__u32 types; /* types of parameter included */
/* presumably meaningful when UBLK_PARAM_TYPE_BASIC is set - confirm */
struct ublk_param_basic basic;
/* presumably meaningful when UBLK_PARAM_TYPE_DISCARD is set - confirm */
struct ublk_param_discard discard;
};
#endif

View File

@ -46,7 +46,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
list_sort.o uuid.o iov_iter.o clz_ctz.o \
bsearch.o find_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o rhashtable.o \
percpu-refcount.o rhashtable.o base64.o \
once.o refcount.o usercopy.o errseq.o bucket_locks.o \
generic-radix-tree.o
obj-$(CONFIG_STRING_SELFTEST) += test_string.o

103
lib/base64.c Normal file
View File

@ -0,0 +1,103 @@
// SPDX-License-Identifier: GPL-2.0
/*
* base64.c - RFC4648-compliant base64 encoding
*
* Copyright (c) 2020 Hannes Reinecke, SUSE
*
* Based on the base64url routines from fs/crypto/fname.c
* (which are using the URL-safe base64 encoding),
* modified to use the standard coding table from RFC4648 section 4.
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/base64.h>
static const char base64_table[65] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/**
 * base64_encode() - base64-encode some binary data
 * @src: the binary data to encode
 * @srclen: the length of @src in bytes
 * @dst: (output) the base64-encoded string. Not NUL-terminated. Has to
 *	 provide room for four characters per started group of three
 *	 input bytes.
 *
 * Encodes data with the "Base 64 Encoding" of RFC 4648, i.e. the standard
 * alphabet with '='-padding of the final group.
 *
 * Return: the length of the resulting base64-encoded string in bytes.
 */
int base64_encode(const u8 *src, int srclen, char *dst)
{
	const u8 *in = src;
	char *out = dst;
	int left;

	/* Every complete 3-byte group becomes exactly four characters. */
	for (left = srclen; left >= 3; left -= 3, in += 3) {
		u32 group = ((u32)in[0] << 16) | ((u32)in[1] << 8) | in[2];

		*out++ = base64_table[(group >> 18) & 0x3f];
		*out++ = base64_table[(group >> 12) & 0x3f];
		*out++ = base64_table[(group >> 6) & 0x3f];
		*out++ = base64_table[group & 0x3f];
	}

	/* A short tail is zero-extended to a sextet boundary and padded. */
	if (left == 2) {
		u32 tail = ((u32)in[0] << 8) | in[1];

		*out++ = base64_table[(tail >> 10) & 0x3f];
		*out++ = base64_table[(tail >> 4) & 0x3f];
		*out++ = base64_table[(tail << 2) & 0x3f];
		*out++ = '=';
	} else if (left == 1) {
		u32 tail = in[0];

		*out++ = base64_table[(tail >> 2) & 0x3f];
		*out++ = base64_table[(tail << 4) & 0x3f];
		*out++ = '=';
		*out++ = '=';
	}

	return out - dst;
}
EXPORT_SYMBOL_GPL(base64_encode);
/*
 * Map one character of the RFC 4648 section 4 alphabet (the one held in
 * base64_table) to its 6-bit value. Returns -1 for every other character,
 * including NUL and '='.
 */
static int base64_char_value(char c)
{
	if (c >= 'A' && c <= 'Z')
		return c - 'A';
	if (c >= 'a' && c <= 'z')
		return c - 'a' + 26;
	if (c >= '0' && c <= '9')
		return c - '0' + 52;
	if (c == '+')
		return 62;
	if (c == '/')
		return 63;
	return -1;
}

/**
 * base64_decode() - base64-decode a string
 * @src: the string to decode. Doesn't need to be NUL-terminated.
 * @srclen: the length of @src in bytes
 * @dst: (output) the decoded binary data. Has to provide room for
 *	 @srclen * 3 / 4 bytes.
 *
 * Decodes a string using base64 encoding, i.e. the "Base 64 Encoding"
 * specified by RFC 4648.
 *
 * The trailing '='-padding is optional. If it is present it has to be exactly
 * the one or two characters that complete the final group of four, and
 * nothing may follow it. Bits of the last character that do not belong to a
 * decoded byte have to be zero, with or without padding.
 *
 * @dst may have been partially written when an error is returned.
 *
 * Return: the length of the resulting decoded binary data in bytes,
 * or -1 if the string isn't a valid base64 string.
 */
int base64_decode(const char *src, int srclen, u8 *dst)
{
	u32 ac = 0;
	int bits = 0;
	int npad = 0;
	int i;
	u8 *bp = dst;

	for (i = 0; i < srclen; i++) {
		int val;

		if (src[i] == '=') {
			npad++;
			continue;
		}

		/* Padding terminates the data, nothing may follow it. */
		if (npad)
			return -1;

		val = base64_char_value(src[i]);
		if (val < 0)
			return -1;

		ac = (ac << 6) | val;
		bits += 6;
		if (bits >= 8) {
			bits -= 8;
			*bp++ = (u8)(ac >> bits);
		}
	}

	/*
	 * 'bits' is 0, 6, 4 or 2 after 0, 1, 2 or 3 characters of the final
	 * group. A single character cannot carry a whole byte.
	 */
	if (bits == 6)
		return -1;

	/* 4 leftover bits take "==", 2 take "=", a full group takes none. */
	if (npad && npad != bits / 2)
		return -1;

	/* The bits that did not make it into a byte must be zero. */
	if (ac & ((1U << bits) - 1))
		return -1;

	return bp - dst;
}
EXPORT_SYMBOL_GPL(base64_decode);