v6.2 second rc pull request

- Several hfi1 patches fixing some long standing driver bugs
 
 - Overflow when working with sg lists with elements greater than 4G
 
 - An rxe regression with object numbering after the mrs reach their limit
 
 - A theoretical problem with the scatterlist merging code
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQRRRCHOFoQz/8F5bUaFwuHvBreFYQUCY8sBzAAKCRCFwuHvBreF
 YWMBAP92rk8L3oLrNbYTryNTv8w/LicLhmAvhC42KRy8klvFkAD6A+wskhxmHMgO
 aSdznob6peMEyNONZUKcijqjnSXhyAY=
 =brq4
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma fixes from Jason Gunthorpe:

 - Several hfi1 patches fixing some long standing driver bugs

 - Overflow when working with sg lists with elements greater than 4G

 - An rxe regression with object numbering after the mrs reach their
   limit

 - A theoretical problem with the scatterlist merging code

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  lib/scatterlist: Fix to calculate the last_pg properly
  IB/hfi1: Remove user expected buffer invalidate race
  IB/hfi1: Immediately remove invalid memory from hardware
  IB/hfi1: Fix expected receive setup error exit issues
  IB/hfi1: Reserve user expected TIDs
  IB/hfi1: Reject a zero-length user expected buffer
  RDMA/core: Fix ib block iterator counter overflow
  RDMA/rxe: Prevent faulty rkey generation
  RDMA/rxe: Fix inaccurate constants in rxe_type_info
This commit is contained in:
Linus Torvalds 2023-01-20 14:15:51 -08:00
commit 8974efaa33
6 changed files with 180 additions and 89 deletions

View File

@ -2957,15 +2957,18 @@ EXPORT_SYMBOL(__rdma_block_iter_start);
bool __rdma_block_iter_next(struct ib_block_iter *biter)
{
unsigned int block_offset;
unsigned int sg_delta;
if (!biter->__sg_nents || !biter->__sg)
return false;
biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset;
sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;
if (biter->__sg_advance >= sg_dma_len(biter->__sg)) {
if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
biter->__sg_advance += sg_delta;
} else {
biter->__sg_advance = 0;
biter->__sg = sg_next(biter->__sg);
biter->__sg_nents--;

View File

@ -23,18 +23,25 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq);
static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
struct tid_group *grp,
unsigned int start, u16 count,
u32 *tidlist, unsigned int *tididx,
unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
struct tid_group **grp);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
static void __clear_tid_node(struct hfi1_filedata *fd,
struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
/*
 * Interval-notifier ops whose invalidate callback is tid_rb_invalidate().
 * NOTE(review): appears to be the ops table passed for per-node
 * (struct tid_rb_node) notifiers in set_rcvarray_entry() -- confirm
 * against the full file.
 */
static const struct mmu_interval_notifier_ops tid_mn_ops = {
.invalidate = tid_rb_invalidate,
};
/*
 * Interval-notifier ops for the notifier embedded in struct tid_user_buf.
 * hfi1_user_exp_rcv_setup() passes this table to
 * mmu_interval_notifier_insert() when fd->use_mn is set; its invalidate
 * callback is tid_cover_invalidate().
 */
static const struct mmu_interval_notifier_ops tid_cover_ops = {
.invalidate = tid_cover_invalidate,
};
/*
* Initialize context and file private data needed for Expected
@ -253,53 +260,65 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
tididx = 0, mapped, mapped_pages = 0;
u32 *tidlist = NULL;
struct tid_user_buf *tidbuf;
unsigned long mmu_seq = 0;
if (!PAGE_ALIGNED(tinfo->vaddr))
return -EINVAL;
if (tinfo->length == 0)
return -EINVAL;
tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
if (!tidbuf)
return -ENOMEM;
mutex_init(&tidbuf->cover_mutex);
tidbuf->vaddr = tinfo->vaddr;
tidbuf->length = tinfo->length;
tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
GFP_KERNEL);
if (!tidbuf->psets) {
kfree(tidbuf);
return -ENOMEM;
ret = -ENOMEM;
goto fail_release_mem;
}
if (fd->use_mn) {
ret = mmu_interval_notifier_insert(
&tidbuf->notifier, current->mm,
tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
&tid_cover_ops);
if (ret)
goto fail_release_mem;
mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
}
pinned = pin_rcv_pages(fd, tidbuf);
if (pinned <= 0) {
kfree(tidbuf->psets);
kfree(tidbuf);
return pinned;
ret = (pinned < 0) ? pinned : -ENOSPC;
goto fail_unpin;
}
/* Find sets of physically contiguous pages */
tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
/*
* We don't need to access this under a lock since tid_used is per
* process and the same process cannot be in hfi1_user_exp_rcv_clear()
* and hfi1_user_exp_rcv_setup() at the same time.
*/
/* Reserve the number of expected tids to be used. */
spin_lock(&fd->tid_lock);
if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
pageset_count = fd->tid_limit - fd->tid_used;
else
pageset_count = tidbuf->n_psets;
fd->tid_used += pageset_count;
spin_unlock(&fd->tid_lock);
if (!pageset_count)
goto bail;
if (!pageset_count) {
ret = -ENOSPC;
goto fail_unreserve;
}
ngroups = pageset_count / dd->rcv_entries.group_size;
tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
if (!tidlist) {
ret = -ENOMEM;
goto nomem;
goto fail_unreserve;
}
tididx = 0;
@ -395,43 +414,78 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
}
unlock:
mutex_unlock(&uctxt->exp_mutex);
nomem:
hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
mapped_pages, ret);
if (tididx) {
spin_lock(&fd->tid_lock);
fd->tid_used += tididx;
spin_unlock(&fd->tid_lock);
tinfo->tidcnt = tididx;
tinfo->length = mapped_pages * PAGE_SIZE;
if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
tidlist, sizeof(tidlist[0]) * tididx)) {
/*
* On failure to copy to the user level, we need to undo
* everything done so far so we don't leak resources.
*/
tinfo->tidlist = (unsigned long)&tidlist;
hfi1_user_exp_rcv_clear(fd, tinfo);
tinfo->tidlist = 0;
ret = -EFAULT;
goto bail;
/* fail if nothing was programmed, set error if none provided */
if (tididx == 0) {
if (ret >= 0)
ret = -ENOSPC;
goto fail_unreserve;
}
/* adjust reserved tid_used to actual count */
spin_lock(&fd->tid_lock);
fd->tid_used -= pageset_count - tididx;
spin_unlock(&fd->tid_lock);
/* unpin all pages not covered by a TID */
unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
false);
if (fd->use_mn) {
/* check for an invalidate during setup */
bool fail = false;
mutex_lock(&tidbuf->cover_mutex);
fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
mutex_unlock(&tidbuf->cover_mutex);
if (fail) {
ret = -EBUSY;
goto fail_unprogram;
}
}
/*
* If not everything was mapped (due to insufficient RcvArray entries,
* for example), unpin all unmapped pages so we can pin them nex time.
*/
if (mapped_pages != pinned)
unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
(pinned - mapped_pages), false);
bail:
kfree(tidbuf->psets);
kfree(tidlist);
tinfo->tidcnt = tididx;
tinfo->length = mapped_pages * PAGE_SIZE;
if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
tidlist, sizeof(tidlist[0]) * tididx)) {
ret = -EFAULT;
goto fail_unprogram;
}
if (fd->use_mn)
mmu_interval_notifier_remove(&tidbuf->notifier);
kfree(tidbuf->pages);
kfree(tidbuf->psets);
kfree(tidbuf);
return ret > 0 ? 0 : ret;
kfree(tidlist);
return 0;
fail_unprogram:
/* unprogram, unmap, and unpin all allocated TIDs */
tinfo->tidlist = (unsigned long)tidlist;
hfi1_user_exp_rcv_clear(fd, tinfo);
tinfo->tidlist = 0;
pinned = 0; /* nothing left to unpin */
pageset_count = 0; /* nothing left reserved */
fail_unreserve:
spin_lock(&fd->tid_lock);
fd->tid_used -= pageset_count;
spin_unlock(&fd->tid_lock);
fail_unpin:
if (fd->use_mn)
mmu_interval_notifier_remove(&tidbuf->notifier);
if (pinned > 0)
unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
fail_release_mem:
kfree(tidbuf->pages);
kfree(tidbuf->psets);
kfree(tidbuf);
kfree(tidlist);
return ret;
}
int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
@ -452,7 +506,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
mutex_lock(&uctxt->exp_mutex);
for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
ret = unprogram_rcvarray(fd, tidinfo[tididx]);
if (ret) {
hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
ret);
@ -706,6 +760,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
}
node->fdata = fd;
mutex_init(&node->invalidate_mutex);
node->phys = page_to_phys(pages[0]);
node->npages = npages;
node->rcventry = rcventry;
@ -721,11 +776,6 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
&tid_mn_ops);
if (ret)
goto out_unmap;
/*
* FIXME: This is in the wrong order, the notifier should be
* established before the pages are pinned by pin_rcv_pages.
*/
mmu_interval_read_begin(&node->notifier);
}
fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
@ -745,8 +795,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
return -EFAULT;
}
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
struct tid_group **grp)
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
@ -769,9 +818,6 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
if (!node || node->rcventry != (uctxt->expected_base + rcventry))
return -EBADF;
if (grp)
*grp = node->grp;
if (fd->use_mn)
mmu_interval_notifier_remove(&node->notifier);
cacheless_tid_rb_remove(fd, node);
@ -779,23 +825,34 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
return 0;
}
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
mutex_lock(&node->invalidate_mutex);
if (node->freed)
goto done;
node->freed = true;
trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
node->npages,
node->notifier.interval_tree.start, node->phys,
node->dma_addr);
/*
* Make sure device has seen the write before we unpin the
* pages.
*/
/* Make sure device has seen the write before pages are unpinned */
hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
done:
mutex_unlock(&node->invalidate_mutex);
}
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
__clear_tid_node(fd, node);
node->grp->used--;
node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
@ -854,10 +911,16 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
if (node->freed)
return true;
/* take action only if unmapping */
if (range->event != MMU_NOTIFY_UNMAP)
return true;
trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
node->notifier.interval_tree.start,
node->rcventry, node->npages, node->dma_addr);
node->freed = true;
/* clear the hardware rcvarray entry */
__clear_tid_node(fdata, node);
spin_lock(&fdata->invalid_lock);
if (fdata->invalid_tid_idx < uctxt->expected_count) {
@ -887,6 +950,23 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
return true;
}
/*
 * Invalidate callback for the notifier embedded in a struct tid_user_buf
 * (installed through tid_cover_ops).
 *
 * @mni:     the notifier; the owning tid_user_buf is recovered from it
 * @range:   the range being invalidated; only range->event is examined
 * @cur_seq: sequence value handed to mmu_interval_set_seq()
 *
 * For MMU_NOTIFY_UNMAP events, cur_seq is recorded on the notifier while
 * tidbuf->cover_mutex is held.  All other event types are ignored.
 *
 * Return: always true.
 */
static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
struct tid_user_buf *tidbuf =
container_of(mni, struct tid_user_buf, notifier);
/* take action only if unmapping */
if (range->event == MMU_NOTIFY_UNMAP) {
/*
 * cover_mutex is the same mutex hfi1_user_exp_rcv_setup() holds
 * around its mmu_interval_read_retry() check.
 */
mutex_lock(&tidbuf->cover_mutex);
mmu_interval_set_seq(mni, cur_seq);
mutex_unlock(&tidbuf->cover_mutex);
}
return true;
}
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
struct tid_rb_node *tnode)
{

View File

@ -16,6 +16,8 @@ struct tid_pageset {
};
struct tid_user_buf {
struct mmu_interval_notifier notifier;
struct mutex cover_mutex;
unsigned long vaddr;
unsigned long length;
unsigned int npages;
@ -27,6 +29,7 @@ struct tid_user_buf {
struct tid_rb_node {
struct mmu_interval_notifier notifier;
struct hfi1_filedata *fdata;
struct mutex invalidate_mutex; /* covers hw removal */
unsigned long phys;
struct tid_group *grp;
u32 rcventry;

View File

@ -98,11 +98,11 @@ enum rxe_device_param {
RXE_MAX_SRQ = DEFAULT_MAX_VALUE - RXE_MIN_SRQ_INDEX,
RXE_MIN_MR_INDEX = 0x00000001,
RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE,
RXE_MAX_MR = DEFAULT_MAX_VALUE - RXE_MIN_MR_INDEX,
RXE_MIN_MW_INDEX = 0x00010001,
RXE_MAX_MW_INDEX = 0x00020000,
RXE_MAX_MW = 0x00001000,
RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE >> 1,
RXE_MAX_MR = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX,
RXE_MIN_MW_INDEX = RXE_MAX_MR_INDEX + 1,
RXE_MAX_MW_INDEX = DEFAULT_MAX_VALUE,
RXE_MAX_MW = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX,
RXE_MAX_PKT_PER_ACK = 64,

View File

@ -23,16 +23,16 @@ static const struct rxe_type_info {
.size = sizeof(struct rxe_ucontext),
.elem_offset = offsetof(struct rxe_ucontext, elem),
.min_index = 1,
.max_index = UINT_MAX,
.max_elem = UINT_MAX,
.max_index = RXE_MAX_UCONTEXT,
.max_elem = RXE_MAX_UCONTEXT,
},
[RXE_TYPE_PD] = {
.name = "pd",
.size = sizeof(struct rxe_pd),
.elem_offset = offsetof(struct rxe_pd, elem),
.min_index = 1,
.max_index = UINT_MAX,
.max_elem = UINT_MAX,
.max_index = RXE_MAX_PD,
.max_elem = RXE_MAX_PD,
},
[RXE_TYPE_AH] = {
.name = "ah",
@ -40,7 +40,7 @@ static const struct rxe_type_info {
.elem_offset = offsetof(struct rxe_ah, elem),
.min_index = RXE_MIN_AH_INDEX,
.max_index = RXE_MAX_AH_INDEX,
.max_elem = RXE_MAX_AH_INDEX - RXE_MIN_AH_INDEX + 1,
.max_elem = RXE_MAX_AH,
},
[RXE_TYPE_SRQ] = {
.name = "srq",
@ -49,7 +49,7 @@ static const struct rxe_type_info {
.cleanup = rxe_srq_cleanup,
.min_index = RXE_MIN_SRQ_INDEX,
.max_index = RXE_MAX_SRQ_INDEX,
.max_elem = RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1,
.max_elem = RXE_MAX_SRQ,
},
[RXE_TYPE_QP] = {
.name = "qp",
@ -58,7 +58,7 @@ static const struct rxe_type_info {
.cleanup = rxe_qp_cleanup,
.min_index = RXE_MIN_QP_INDEX,
.max_index = RXE_MAX_QP_INDEX,
.max_elem = RXE_MAX_QP_INDEX - RXE_MIN_QP_INDEX + 1,
.max_elem = RXE_MAX_QP,
},
[RXE_TYPE_CQ] = {
.name = "cq",
@ -66,8 +66,8 @@ static const struct rxe_type_info {
.elem_offset = offsetof(struct rxe_cq, elem),
.cleanup = rxe_cq_cleanup,
.min_index = 1,
.max_index = UINT_MAX,
.max_elem = UINT_MAX,
.max_index = RXE_MAX_CQ,
.max_elem = RXE_MAX_CQ,
},
[RXE_TYPE_MR] = {
.name = "mr",
@ -76,7 +76,7 @@ static const struct rxe_type_info {
.cleanup = rxe_mr_cleanup,
.min_index = RXE_MIN_MR_INDEX,
.max_index = RXE_MAX_MR_INDEX,
.max_elem = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1,
.max_elem = RXE_MAX_MR,
},
[RXE_TYPE_MW] = {
.name = "mw",
@ -85,7 +85,7 @@ static const struct rxe_type_info {
.cleanup = rxe_mw_cleanup,
.min_index = RXE_MIN_MW_INDEX,
.max_index = RXE_MAX_MW_INDEX,
.max_elem = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX + 1,
.max_elem = RXE_MAX_MW,
},
};

View File

@ -470,22 +470,27 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append,
return -EOPNOTSUPP;
if (sgt_append->prv) {
unsigned long next_pfn = (page_to_phys(sg_page(sgt_append->prv)) +
sgt_append->prv->offset + sgt_append->prv->length) / PAGE_SIZE;
if (WARN_ON(offset))
return -EINVAL;
/* Merge contiguous pages into the last SG */
prv_len = sgt_append->prv->length;
last_pg = sg_page(sgt_append->prv);
while (n_pages && pages_are_mergeable(pages[0], last_pg)) {
if (sgt_append->prv->length + PAGE_SIZE > max_segment)
break;
sgt_append->prv->length += PAGE_SIZE;
last_pg = pages[0];
pages++;
n_pages--;
if (page_to_pfn(pages[0]) == next_pfn) {
last_pg = pfn_to_page(next_pfn - 1);
while (n_pages && pages_are_mergeable(pages[0], last_pg)) {
if (sgt_append->prv->length + PAGE_SIZE > max_segment)
break;
sgt_append->prv->length += PAGE_SIZE;
last_pg = pages[0];
pages++;
n_pages--;
}
if (!n_pages)
goto out;
}
if (!n_pages)
goto out;
}
/* compute number of contiguous chunks */