Use ODP MRs for kernel ULPs

The following series extends MR creation routines to allow creation of
 user MRs through kernel ULPs as a proxy. The immediate use case is to
 allow RDS to work over FS-DAX, which requires ODP (on-demand-paging)
 MRs to be created and such MRs were not possible to create prior this
 series.
 
 The first part of this patchset extends RDMA to have special verb
 ib_reg_user_mr(). The common use case that uses this function is a
 userspace application that allocates memory for HCA access but the
 responsibility to register the memory at the HCA is on an kernel ULP.
 This ULP acts as an agent for the userspace application.
 
 The second part provides advise MR functionality for ULPs. This is
 integral part of ODP flows and used to trigger pagefaults in advance
 to prepare memory before running working set.
 
 The third part is actual user of those in-kernel APIs.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQT1m3YD37UfMCUQBNwp8NhrnBAZsQUCXiVO8AAKCRAp8NhrnBAZ
 scTrAP9gb0d3qv0IOtHw5aGI1DAgjTUn/SzUOnsjDEn7DIoh9gEA2+ZmaEyLXKrl
 +UcZb31auy5P8ueJYokRLhLAyRcOIAg=
 =yaHb
 -----END PGP SIGNATURE-----

Merge tag 'rds-odp-for-5.5' of https://git.kernel.org/pub/scm/linux/kernel/git/leon/linux-rdma

Leon Romanovsky says:

====================
Use ODP MRs for kernel ULPs

The following series extends MR creation routines to allow creation of
user MRs through kernel ULPs as a proxy. The immediate use case is to
allow RDS to work over FS-DAX, which requires ODP (on-demand-paging)
MRs to be created and such MRs were not possible to create prior this
series.

The first part of this patchset extends RDMA to have special verb
ib_reg_user_mr(). The common use case that uses this function is a
userspace application that allocates memory for HCA access but the
responsibility to register the memory at the HCA is on an kernel ULP.
This ULP acts as an agent for the userspace application.

The second part provides advise MR functionality for ULPs. This is
integral part of ODP flows and used to trigger pagefaults in advance
to prepare memory before running working set.

The third part is actual user of those in-kernel APIs.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2020-01-21 10:22:51 +01:00
commit ad063075d4
45 changed files with 560 additions and 255 deletions

View File

@ -181,15 +181,14 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz);
/**
* ib_umem_get - Pin and DMA map userspace memory.
*
* @udata: userspace context to pin memory for
* @device: IB device to connect UMEM
* @addr: userspace virtual address to start at
* @size: length of region to pin
* @access: IB_ACCESS_xxx flags for memory being pinned
*/
struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
size_t size, int access)
{
struct ib_ucontext *context;
struct ib_umem *umem;
struct page **page_list;
unsigned long lock_limit;
@ -201,14 +200,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
struct scatterlist *sg;
unsigned int gup_flags = FOLL_WRITE;
if (!udata)
return ERR_PTR(-EIO);
context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
->context;
if (!context)
return ERR_PTR(-EIO);
/*
* If the combination of the addr and size requested for this memory
* region causes an integer overflow, return error.
@ -226,7 +217,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
umem = kzalloc(sizeof(*umem), GFP_KERNEL);
if (!umem)
return ERR_PTR(-ENOMEM);
umem->ibdev = context->device;
umem->ibdev = device;
umem->length = size;
umem->address = addr;
umem->writable = ib_access_writable(access);
@ -281,7 +272,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
npages -= ret;
sg = ib_umem_add_sg_table(sg, page_list, ret,
dma_get_max_seg_size(context->device->dma_device),
dma_get_max_seg_size(device->dma_device),
&umem->sg_nents);
up_read(&mm->mmap_sem);
@ -289,10 +280,10 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
sg_mark_end(sg);
umem->nmap = ib_dma_map_sg(context->device,
umem->sg_head.sgl,
umem->sg_nents,
DMA_BIDIRECTIONAL);
umem->nmap = ib_dma_map_sg(device,
umem->sg_head.sgl,
umem->sg_nents,
DMA_BIDIRECTIONAL);
if (!umem->nmap) {
ret = -ENOMEM;
@ -303,7 +294,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
goto out;
umem_release:
__ib_umem_release(context->device, umem, 0);
__ib_umem_release(device, umem, 0);
vma:
atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:

View File

@ -110,15 +110,12 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
* They exist only to hold the per_mm reference to help the driver create
* children umems.
*
* @udata: udata from the syscall being used to create the umem
* @device: IB device to create UMEM
* @access: ib_reg_mr access flags
*/
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
int access)
{
struct ib_ucontext *context =
container_of(udata, struct uverbs_attr_bundle, driver_udata)
->context;
struct ib_umem *umem;
struct ib_umem_odp *umem_odp;
int ret;
@ -126,14 +123,11 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
if (access & IB_ACCESS_HUGETLB)
return ERR_PTR(-EINVAL);
if (!context)
return ERR_PTR(-EIO);
umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
if (!umem_odp)
return ERR_PTR(-ENOMEM);
umem = &umem_odp->umem;
umem->ibdev = context->device;
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
umem_odp->is_implicit_odp = 1;
@ -201,7 +195,7 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
/**
* ib_umem_odp_get - Create a umem_odp for a userspace va
*
* @udata: userspace context to pin memory for
* @device: IB device struct to get UMEM
* @addr: userspace virtual address to start at
* @size: length of region to pin
* @access: IB_ACCESS_xxx flags for memory being pinned
@ -210,23 +204,14 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
* pinning, instead, stores the mm for future page fault handling in
* conjunction with MMU notifiers.
*/
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
size_t size, int access,
struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
unsigned long addr, size_t size, int access,
const struct mmu_interval_notifier_ops *ops)
{
struct ib_umem_odp *umem_odp;
struct ib_ucontext *context;
struct mm_struct *mm;
int ret;
if (!udata)
return ERR_PTR(-EIO);
context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
->context;
if (!context)
return ERR_PTR(-EIO);
if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
return ERR_PTR(-EINVAL);
@ -234,7 +219,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
if (!umem_odp)
return ERR_PTR(-ENOMEM);
umem_odp->umem.ibdev = context->device;
umem_odp->umem.ibdev = device;
umem_odp->umem.length = size;
umem_odp->umem.address = addr;
umem_odp->umem.writable = ib_access_writable(access);

View File

@ -1990,6 +1990,47 @@ EXPORT_SYMBOL(ib_resize_cq);
/* Memory regions */
struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags)
{
struct ib_mr *mr;
if (access_flags & IB_ACCESS_ON_DEMAND) {
if (!(pd->device->attrs.device_cap_flags &
IB_DEVICE_ON_DEMAND_PAGING)) {
pr_debug("ODP support not available\n");
return ERR_PTR(-EINVAL);
}
}
mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
access_flags, NULL);
if (IS_ERR(mr))
return mr;
mr->device = pd->device;
mr->pd = pd;
mr->dm = NULL;
atomic_inc(&pd->usecnt);
mr->res.type = RDMA_RESTRACK_MR;
rdma_restrack_kadd(&mr->res);
return mr;
}
EXPORT_SYMBOL(ib_reg_user_mr);
int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
if (!pd->device->ops.advise_mr)
return -EOPNOTSUPP;
return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
NULL);
}
EXPORT_SYMBOL(ib_advise_mr);
int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
{
struct ib_pd *pd = mr->pd;

View File

@ -837,7 +837,8 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
bytes += (qplib_qp->sq.max_wqe * psn_sz);
}
bytes = PAGE_ALIGN(bytes);
umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE);
umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem))
return PTR_ERR(umem);
@ -850,7 +851,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
if (!qp->qplib_qp.srq) {
bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes);
umem = ib_umem_get(udata, ureq.qprva, bytes,
umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem))
goto rqfail;
@ -1304,7 +1305,8 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes);
umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE);
umem = ib_umem_get(&rdev->ibdev, ureq.srqva, bytes,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem))
return PTR_ERR(umem);
@ -2545,7 +2547,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
goto fail;
}
cq->umem = ib_umem_get(udata, req.cq_va,
cq->umem = ib_umem_get(&rdev->ibdev, req.cq_va,
entries * sizeof(struct cq_base),
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(cq->umem)) {
@ -3514,7 +3516,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
/* The fixed portion of the rkey is the same as the lkey */
mr->ib_mr.rkey = mr->qplib_mr.rkey;
umem = ib_umem_get(udata, start, length, mr_access_flags);
umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags);
if (IS_ERR(umem)) {
dev_err(rdev_to_dev(rdev), "Failed to get umem");
rc = -EFAULT;

View File

@ -543,7 +543,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mhp->rhp = rhp;
mhp->umem = ib_umem_get(udata, start, length, acc);
mhp->umem = ib_umem_get(pd->device, start, length, acc);
if (IS_ERR(mhp->umem))
goto err_free_skb;

View File

@ -1358,7 +1358,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
int inline_size;
int err;
if (udata->inlen &&
if (udata && udata->inlen &&
!ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
ibdev_dbg(&dev->ibdev,
"Incompatible ABI params, udata not cleared\n");
@ -1384,7 +1384,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
goto err_out;
}
mr->umem = ib_umem_get(udata, start, length, access_flags);
mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem);
ibdev_dbg(&dev->ibdev,

View File

@ -163,7 +163,7 @@ static int get_cq_umem(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
u32 npages;
int ret;
*umem = ib_umem_get(udata, ucmd.buf_addr, buf->size,
*umem = ib_umem_get(&hr_dev->ib_dev, ucmd.buf_addr, buf->size,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(*umem))
return PTR_ERR(*umem);

View File

@ -31,7 +31,8 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context,
refcount_set(&page->refcount, 1);
page->user_virt = page_addr;
page->umem = ib_umem_get(udata, page_addr, PAGE_SIZE, 0);
page->umem = ib_umem_get(context->ibucontext.device, page_addr,
PAGE_SIZE, 0);
if (IS_ERR(page->umem)) {
ret = PTR_ERR(page->umem);
kfree(page);

View File

@ -1145,7 +1145,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr)
return ERR_PTR(-ENOMEM);
mr->umem = ib_umem_get(udata, start, length, access_flags);
mr->umem = ib_umem_get(pd->device, start, length, access_flags);
if (IS_ERR(mr->umem)) {
ret = PTR_ERR(mr->umem);
goto err_free;
@ -1230,7 +1230,7 @@ static int rereg_mr_trans(struct ib_mr *ibmr, int flags,
}
ib_umem_release(mr->umem);
mr->umem = ib_umem_get(udata, start, length, mr_access_flags);
mr->umem = ib_umem_get(ibmr->device, start, length, mr_access_flags);
if (IS_ERR(mr->umem)) {
ret = PTR_ERR(mr->umem);
mr->umem = NULL;

View File

@ -744,7 +744,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
goto err_alloc_rq_inline_buf;
}
hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr,
hr_qp->umem = ib_umem_get(ib_pd->device, ucmd.buf_addr,
hr_qp->buff_size, 0);
if (IS_ERR(hr_qp->umem)) {
dev_err(dev, "ib_umem_get error for create qp\n");

View File

@ -186,7 +186,8 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
return -EFAULT;
srq->umem = ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0);
srq->umem =
ib_umem_get(srq->ibsrq.device, ucmd.buf_addr, srq_buf_size, 0);
if (IS_ERR(srq->umem))
return PTR_ERR(srq->umem);
@ -205,7 +206,7 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
goto err_user_srq_mtt;
/* config index queue BA */
srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr,
srq->idx_que.umem = ib_umem_get(srq->ibsrq.device, ucmd.que_addr,
srq->idx_que.buf_size, 0);
if (IS_ERR(srq->idx_que.umem)) {
dev_err(hr_dev->dev, "ib_umem_get error for index queue\n");

View File

@ -1756,12 +1756,15 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
int ret;
int pg_shift;
if (!udata)
return ERR_PTR(-EOPNOTSUPP);
if (iwdev->closing)
return ERR_PTR(-ENODEV);
if (length > I40IW_MAX_MR_SIZE)
return ERR_PTR(-EINVAL);
region = ib_umem_get(udata, start, length, acc);
region = ib_umem_get(pd->device, start, length, acc);
if (IS_ERR(region))
return (struct ib_mr *)region;

View File

@ -144,7 +144,7 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata,
int shift;
int n;
*umem = ib_umem_get(udata, buf_addr, cqe * cqe_size,
*umem = ib_umem_get(&dev->ib_dev, buf_addr, cqe * cqe_size,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(*umem))
return PTR_ERR(*umem);

View File

@ -64,7 +64,8 @@ int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,
page->user_virt = (virt & PAGE_MASK);
page->refcnt = 0;
page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0);
page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
PAGE_SIZE, 0);
if (IS_ERR(page->umem)) {
err = PTR_ERR(page->umem);
kfree(page);

View File

@ -367,7 +367,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
return block_shift;
}
static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start,
static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start,
u64 length, int access_flags)
{
/*
@ -398,7 +398,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start,
up_read(&current->mm->mmap_sem);
}
return ib_umem_get(udata, start, length, access_flags);
return ib_umem_get(device, start, length, access_flags);
}
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@ -415,7 +415,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr)
return ERR_PTR(-ENOMEM);
mr->umem = mlx4_get_umem_mr(udata, start, length, access_flags);
mr->umem = mlx4_get_umem_mr(pd->device, start, length, access_flags);
if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem);
goto err_free;
@ -504,7 +504,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
ib_umem_release(mmr->umem);
mmr->umem = mlx4_get_umem_mr(udata, start, length,
mmr->umem = mlx4_get_umem_mr(mr->device, start, length,
mr_access_flags);
if (IS_ERR(mmr->umem)) {
err = PTR_ERR(mmr->umem);

View File

@ -916,7 +916,7 @@ static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
(qp->sq.wqe_cnt << qp->sq.wqe_shift);
qp->umem = ib_umem_get(udata, wq.buf_addr, qp->buf_size, 0);
qp->umem = ib_umem_get(pd->device, wq.buf_addr, qp->buf_size, 0);
if (IS_ERR(qp->umem)) {
err = PTR_ERR(qp->umem);
goto err;
@ -1110,7 +1110,8 @@ static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
if (err)
goto err;
qp->umem = ib_umem_get(udata, ucmd.buf_addr, qp->buf_size, 0);
qp->umem =
ib_umem_get(pd->device, ucmd.buf_addr, qp->buf_size, 0);
if (IS_ERR(qp->umem)) {
err = PTR_ERR(qp->umem);
goto err;

View File

@ -110,7 +110,8 @@ int mlx4_ib_create_srq(struct ib_srq *ib_srq,
if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
return -EFAULT;
srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0);
srq->umem =
ib_umem_get(ib_srq->device, ucmd.buf_addr, buf_size, 0);
if (IS_ERR(srq->umem))
return PTR_ERR(srq->umem);

View File

@ -708,8 +708,8 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
*cqe_size = ucmd.cqe_size;
cq->buf.umem =
ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size,
IB_ACCESS_LOCAL_WRITE);
ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(cq->buf.umem)) {
err = PTR_ERR(cq->buf.umem);
return err;
@ -1108,7 +1108,7 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
return -EINVAL;
umem = ib_umem_get(udata, ucmd.buf_addr,
umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
(size_t)ucmd.cqe_size * entries,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem)) {

View File

@ -2134,7 +2134,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
if (err)
return err;
obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access);
obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access);
if (IS_ERR(obj->umem))
return PTR_ERR(obj->umem);

View File

@ -64,7 +64,8 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
page->user_virt = (virt & PAGE_MASK);
page->refcnt = 0;
page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0);
page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
PAGE_SIZE, 0);
if (IS_ERR(page->umem)) {
err = PTR_ERR(page->umem);
kfree(page);

View File

@ -815,6 +815,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
struct ib_device_attr *props,
struct ib_udata *uhw)
{
size_t uhw_outlen = (uhw) ? uhw->outlen : 0;
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev = dev->mdev;
int err = -ENOMEM;
@ -828,12 +829,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
u64 max_tso;
resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
if (uhw->outlen && uhw->outlen < resp_len)
if (uhw_outlen && uhw_outlen < resp_len)
return -EINVAL;
resp.response_length = resp_len;
if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
return -EINVAL;
memset(props, 0, sizeof(*props));
@ -897,7 +898,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->raw_packet_caps |=
IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
if (field_avail(typeof(resp), tso_caps, uhw_outlen)) {
max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
if (max_tso) {
resp.tso_caps.max_tso = 1 << max_tso;
@ -907,7 +908,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
}
if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
if (field_avail(typeof(resp), rss_caps, uhw_outlen)) {
resp.rss_caps.rx_hash_function =
MLX5_RX_HASH_FUNC_TOEPLITZ;
resp.rss_caps.rx_hash_fields_mask =
@ -927,9 +928,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
resp.response_length += sizeof(resp.rss_caps);
}
} else {
if (field_avail(typeof(resp), tso_caps, uhw->outlen))
if (field_avail(typeof(resp), tso_caps, uhw_outlen))
resp.response_length += sizeof(resp.tso_caps);
if (field_avail(typeof(resp), rss_caps, uhw->outlen))
if (field_avail(typeof(resp), rss_caps, uhw_outlen))
resp.response_length += sizeof(resp.rss_caps);
}
@ -1014,6 +1015,23 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
props->odp_caps = dev->odp_caps;
if (!uhw) {
/* ODP for kernel QPs is not implemented for receive
* WQEs and SRQ WQEs
*/
props->odp_caps.per_transport_caps.rc_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
props->odp_caps.per_transport_caps.uc_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
props->odp_caps.per_transport_caps.ud_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
props->odp_caps.per_transport_caps.xrc_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
}
}
if (MLX5_CAP_GEN(mdev, cd))
@ -1054,7 +1072,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
MLX5_MAX_CQ_PERIOD;
}
if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) {
if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) {
resp.response_length += sizeof(resp.cqe_comp_caps);
if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
@ -1072,7 +1090,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
}
if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) &&
if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) &&
raw_support) {
if (MLX5_CAP_QOS(mdev, packet_pacing) &&
MLX5_CAP_GEN(mdev, qos)) {
@ -1091,7 +1109,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
uhw->outlen)) {
uhw_outlen)) {
if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
resp.mlx5_ib_support_multi_pkt_send_wqes =
MLX5_IB_ALLOW_MPW;
@ -1104,7 +1122,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
}
if (field_avail(typeof(resp), flags, uhw->outlen)) {
if (field_avail(typeof(resp), flags, uhw_outlen)) {
resp.response_length += sizeof(resp.flags);
if (MLX5_CAP_GEN(mdev, cqe_compression_128))
@ -1120,8 +1138,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
}
if (field_avail(typeof(resp), sw_parsing_caps,
uhw->outlen)) {
if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) {
resp.response_length += sizeof(resp.sw_parsing_caps);
if (MLX5_CAP_ETH(mdev, swp)) {
resp.sw_parsing_caps.sw_parsing_offloads |=
@ -1141,7 +1158,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
}
if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) &&
if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) &&
raw_support) {
resp.response_length += sizeof(resp.striding_rq_caps);
if (MLX5_CAP_GEN(mdev, striding_rq)) {
@ -1164,8 +1181,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
}
}
if (field_avail(typeof(resp), tunnel_offloads_caps,
uhw->outlen)) {
if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) {
resp.response_length += sizeof(resp.tunnel_offloads_caps);
if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
resp.tunnel_offloads_caps |=
@ -1186,7 +1202,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
}
if (uhw->outlen) {
if (uhw_outlen) {
err = ib_copy_to_udata(uhw, &resp, resp.response_length);
if (err)
@ -4773,7 +4789,6 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
struct ib_device_attr *dprops = NULL;
struct ib_port_attr *pprops = NULL;
int err = -ENOMEM;
struct ib_udata uhw = {.inlen = 0, .outlen = 0};
pprops = kzalloc(sizeof(*pprops), GFP_KERNEL);
if (!pprops)
@ -4783,7 +4798,7 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
if (!dprops)
goto out;
err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
err = mlx5_ib_query_device(&dev->ib_dev, dprops, NULL);
if (err) {
mlx5_ib_warn(dev, "query_device failed %d\n", err);
goto out;

View File

@ -1153,12 +1153,12 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
const struct ib_send_wr **bad_wr);
int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_wr);
int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
int buflen, size_t *bc);
int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
int buflen, size_t *bc);
int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
void *buffer, int buflen, size_t *bc);
int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
size_t buflen, size_t *bc);
int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
size_t buflen, size_t *bc);
int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
size_t buflen, size_t *bc);
int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct ib_udata *udata);
void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);

View File

@ -737,10 +737,9 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev)
return MLX5_MAX_UMR_SHIFT;
}
static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
u64 start, u64 length, int access_flags,
struct ib_umem **umem, int *npages, int *page_shift,
int *ncont, int *order)
static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length,
int access_flags, struct ib_umem **umem, int *npages,
int *page_shift, int *ncont, int *order)
{
struct ib_umem *u;
@ -749,7 +748,7 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
if (access_flags & IB_ACCESS_ON_DEMAND) {
struct ib_umem_odp *odp;
odp = ib_umem_odp_get(udata, start, length, access_flags,
odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
&mlx5_mn_ops);
if (IS_ERR(odp)) {
mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
@ -765,7 +764,7 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
if (order)
*order = ilog2(roundup_pow_of_two(*ncont));
} else {
u = ib_umem_get(udata, start, length, access_flags);
u = ib_umem_get(&dev->ib_dev, start, length, access_flags);
if (IS_ERR(u)) {
mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
return PTR_ERR(u);
@ -1247,6 +1246,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
length == U64_MAX) {
if (virt_addr != start)
return ERR_PTR(-EINVAL);
if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
return ERR_PTR(-EINVAL);
@ -1257,7 +1258,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
return &mr->ibmr;
}
err = mr_umem_get(dev, udata, start, length, access_flags, &umem,
err = mr_umem_get(dev, start, length, access_flags, &umem,
&npages, &page_shift, &ncont, &order);
if (err < 0)
@ -1424,9 +1425,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
flags |= IB_MR_REREG_TRANS;
ib_umem_release(mr->umem);
mr->umem = NULL;
err = mr_umem_get(dev, udata, addr, len, access_flags,
&mr->umem, &npages, &page_shift, &ncont,
&order);
err = mr_umem_get(dev, addr, len, access_flags, &mr->umem,
&npages, &page_shift, &ncont, &order);
if (err)
goto err;
}

View File

@ -497,7 +497,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
struct mlx5_ib_mr *imr;
int err;
umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
if (IS_ERR(umem_odp))
return ERR_CAST(umem_odp);
@ -624,11 +624,10 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
unsigned long current_seq;
u64 access_mask;
u64 start_idx, page_mask;
u64 start_idx;
page_shift = odp->page_shift;
page_mask = ~(BIT(page_shift) - 1);
start_idx = (user_va - (mr->mmkey.iova & page_mask)) >> page_shift;
start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT;
if (odp->umem.writable && !downgrade)
@ -767,11 +766,19 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
if (unlikely(io_virt < mr->mmkey.iova))
return -EFAULT;
if (!odp->is_implicit_odp) {
if (unlikely(io_virt < ib_umem_start(odp) ||
ib_umem_end(odp) - io_virt < bcnt))
u64 user_va;
if (check_add_overflow(io_virt - mr->mmkey.iova,
(u64)odp->umem.address, &user_va))
return -EFAULT;
return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped,
if (unlikely(user_va >= ib_umem_end(odp) ||
ib_umem_end(odp) - user_va < bcnt))
return -EFAULT;
return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
flags);
}
return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
@ -1237,15 +1244,15 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
wqe = wqe_start;
qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
if (qp && sq) {
ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
&bytes_copied);
ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
&bytes_copied);
if (ret)
goto read_user;
ret = mlx5_ib_mr_initiator_pfault_handler(
dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
} else if (qp && !sq) {
ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
&bytes_copied);
ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
&bytes_copied);
if (ret)
goto read_user;
ret = mlx5_ib_mr_responder_pfault_handler_rq(
@ -1253,8 +1260,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
} else if (!qp) {
struct mlx5_ib_srq *srq = res_to_srq(res);
ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
&bytes_copied);
ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
&bytes_copied);
if (ret)
goto read_user;
ret = mlx5_ib_mr_responder_pfault_handler_srq(

View File

@ -129,14 +129,10 @@ static int is_sqp(enum ib_qp_type qp_type)
*
* Return: zero on success, or an error code.
*/
static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
void *buffer,
u32 buflen,
int wqe_index,
int wq_offset,
int wq_wqe_cnt,
int wq_wqe_shift,
int bcnt,
static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, void *buffer,
size_t buflen, int wqe_index,
int wq_offset, int wq_wqe_cnt,
int wq_wqe_shift, int bcnt,
size_t *bytes_copied)
{
size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift);
@ -160,11 +156,43 @@ static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
return 0;
}
int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
int wqe_index,
void *buffer,
int buflen,
size_t *bc)
static int mlx5_ib_read_kernel_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
void *buffer, size_t buflen, size_t *bc)
{
struct mlx5_wqe_ctrl_seg *ctrl;
size_t bytes_copied = 0;
size_t wqe_length;
void *p;
int ds;
wqe_index = wqe_index & qp->sq.fbc.sz_m1;
/* read the control segment first */
p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
ctrl = p;
ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
wqe_length = ds * MLX5_WQE_DS_UNITS;
/* read rest of WQE if it spreads over more than one stride */
while (bytes_copied < wqe_length) {
size_t copy_length =
min_t(size_t, buflen - bytes_copied, MLX5_SEND_WQE_BB);
if (!copy_length)
break;
memcpy(buffer + bytes_copied, p, copy_length);
bytes_copied += copy_length;
wqe_index = (wqe_index + 1) & qp->sq.fbc.sz_m1;
p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
}
*bc = bytes_copied;
return 0;
}
static int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
void *buffer, size_t buflen, size_t *bc)
{
struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct ib_umem *umem = base->ubuffer.umem;
@ -176,18 +204,10 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
int ret;
int ds;
if (buflen < sizeof(*ctrl))
return -EINVAL;
/* at first read as much as possible */
ret = mlx5_ib_read_user_wqe_common(umem,
buffer,
buflen,
wqe_index,
wq->offset,
wq->wqe_cnt,
wq->wqe_shift,
buflen,
ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
wq->offset, wq->wqe_cnt,
wq->wqe_shift, buflen,
&bytes_copied);
if (ret)
return ret;
@ -210,13 +230,9 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
* so read the remaining bytes starting
* from wqe_index 0
*/
ret = mlx5_ib_read_user_wqe_common(umem,
buffer + bytes_copied,
buflen - bytes_copied,
0,
wq->offset,
wq->wqe_cnt,
wq->wqe_shift,
ret = mlx5_ib_read_user_wqe_common(umem, buffer + bytes_copied,
buflen - bytes_copied, 0, wq->offset,
wq->wqe_cnt, wq->wqe_shift,
wqe_length - bytes_copied,
&bytes_copied2);
@ -226,11 +242,24 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
return 0;
}
int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
int wqe_index,
void *buffer,
int buflen,
size_t *bc)
int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
size_t buflen, size_t *bc)
{
struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct ib_umem *umem = base->ubuffer.umem;
if (buflen < sizeof(struct mlx5_wqe_ctrl_seg))
return -EINVAL;
if (!umem)
return mlx5_ib_read_kernel_wqe_sq(qp, wqe_index, buffer,
buflen, bc);
return mlx5_ib_read_user_wqe_sq(qp, wqe_index, buffer, buflen, bc);
}
static int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index,
void *buffer, size_t buflen, size_t *bc)
{
struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct ib_umem *umem = base->ubuffer.umem;
@ -238,14 +267,9 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
size_t bytes_copied;
int ret;
ret = mlx5_ib_read_user_wqe_common(umem,
buffer,
buflen,
wqe_index,
wq->offset,
wq->wqe_cnt,
wq->wqe_shift,
buflen,
ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
wq->offset, wq->wqe_cnt,
wq->wqe_shift, buflen,
&bytes_copied);
if (ret)
@ -254,25 +278,33 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
return 0;
}
int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
int wqe_index,
void *buffer,
int buflen,
size_t *bc)
int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
size_t buflen, size_t *bc)
{
struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct ib_umem *umem = base->ubuffer.umem;
struct mlx5_ib_wq *wq = &qp->rq;
size_t wqe_size = 1 << wq->wqe_shift;
if (buflen < wqe_size)
return -EINVAL;
if (!umem)
return -EOPNOTSUPP;
return mlx5_ib_read_user_wqe_rq(qp, wqe_index, buffer, buflen, bc);
}
static int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
void *buffer, size_t buflen, size_t *bc)
{
struct ib_umem *umem = srq->umem;
size_t bytes_copied;
int ret;
ret = mlx5_ib_read_user_wqe_common(umem,
buffer,
buflen,
wqe_index,
0,
srq->msrq.max,
srq->msrq.wqe_shift,
buflen,
&bytes_copied);
ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, 0,
srq->msrq.max, srq->msrq.wqe_shift,
buflen, &bytes_copied);
if (ret)
return ret;
@ -280,6 +312,21 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
return 0;
}
int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
size_t buflen, size_t *bc)
{
struct ib_umem *umem = srq->umem;
size_t wqe_size = 1 << srq->msrq.wqe_shift;
if (buflen < wqe_size)
return -EINVAL;
if (!umem)
return -EOPNOTSUPP;
return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc);
}
static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
{
struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
@ -749,7 +796,7 @@ static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
{
int err;
*umem = ib_umem_get(udata, addr, size, 0);
*umem = ib_umem_get(&dev->ib_dev, addr, size, 0);
if (IS_ERR(*umem)) {
mlx5_ib_dbg(dev, "umem_get failed\n");
return PTR_ERR(*umem);
@ -806,7 +853,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
if (!ucmd->buf_addr)
return -EINVAL;
rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0);
rwq->umem = ib_umem_get(&dev->ib_dev, ucmd->buf_addr, rwq->buf_size, 0);
if (IS_ERR(rwq->umem)) {
mlx5_ib_dbg(dev, "umem_get failed\n");
err = PTR_ERR(rwq->umem);

View File

@ -80,7 +80,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0);
srq->umem = ib_umem_get(pd->device, ucmd.buf_addr, buf_size, 0);
if (IS_ERR(srq->umem)) {
mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
err = PTR_ERR(srq->umem);

View File

@ -880,7 +880,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr)
return ERR_PTR(-ENOMEM);
mr->umem = ib_umem_get(udata, start, length, acc);
mr->umem = ib_umem_get(pd->device, start, length, acc);
if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem);
goto err;

View File

@ -869,7 +869,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
if (!mr)
return ERR_PTR(status);
mr->umem = ib_umem_get(udata, start, len, acc);
mr->umem = ib_umem_get(ibpd->device, start, len, acc);
if (IS_ERR(mr->umem)) {
status = -EFAULT;
goto umem_err;

View File

@ -772,7 +772,7 @@ static inline int qedr_init_user_queue(struct ib_udata *udata,
q->buf_addr = buf_addr;
q->buf_len = buf_len;
q->umem = ib_umem_get(udata, q->buf_addr, q->buf_len, access);
q->umem = ib_umem_get(&dev->ibdev, q->buf_addr, q->buf_len, access);
if (IS_ERR(q->umem)) {
DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n",
PTR_ERR(q->umem));
@ -1415,9 +1415,8 @@ static int qedr_init_srq_user_params(struct ib_udata *udata,
if (rc)
return rc;
srq->prod_umem =
ib_umem_get(udata, ureq->prod_pair_addr,
sizeof(struct rdma_srq_producers), access);
srq->prod_umem = ib_umem_get(srq->ibsrq.device, ureq->prod_pair_addr,
sizeof(struct rdma_srq_producers), access);
if (IS_ERR(srq->prod_umem)) {
qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl);
ib_umem_release(srq->usrq.umem);
@ -2839,7 +2838,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
mr->type = QEDR_MR_USER;
mr->umem = ib_umem_get(udata, start, len, acc);
mr->umem = ib_umem_get(ibpd->device, start, len, acc);
if (IS_ERR(mr->umem)) {
rc = -EFAULT;
goto err0;

View File

@ -135,7 +135,7 @@ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
goto err_cq;
}
cq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size,
cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, ucmd.buf_size,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(cq->umem)) {
ret = PTR_ERR(cq->umem);

View File

@ -126,7 +126,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
return ERR_PTR(-EINVAL);
}
umem = ib_umem_get(udata, start, length, access_flags);
umem = ib_umem_get(pd->device, start, length, access_flags);
if (IS_ERR(umem)) {
dev_warn(&dev->pdev->dev,
"could not get umem for mem region\n");

View File

@ -276,8 +276,9 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
if (!is_srq) {
/* set qp->sq.wqe_cnt, shift, buf_size.. */
qp->rumem = ib_umem_get(udata, ucmd.rbuf_addr,
ucmd.rbuf_size, 0);
qp->rumem =
ib_umem_get(pd->device, ucmd.rbuf_addr,
ucmd.rbuf_size, 0);
if (IS_ERR(qp->rumem)) {
ret = PTR_ERR(qp->rumem);
goto err_qp;
@ -288,7 +289,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
qp->srq = to_vsrq(init_attr->srq);
}
qp->sumem = ib_umem_get(udata, ucmd.sbuf_addr,
qp->sumem = ib_umem_get(pd->device, ucmd.sbuf_addr,
ucmd.sbuf_size, 0);
if (IS_ERR(qp->sumem)) {
if (!is_srq)

View File

@ -146,7 +146,7 @@ int pvrdma_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
goto err_srq;
}
srq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, 0);
srq->umem = ib_umem_get(ibsrq->device, ucmd.buf_addr, ucmd.buf_size, 0);
if (IS_ERR(srq->umem)) {
ret = PTR_ERR(srq->umem);
goto err_srq;

View File

@ -390,7 +390,7 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (length == 0)
return ERR_PTR(-EINVAL);
umem = ib_umem_get(udata, start, length, mr_access_flags);
umem = ib_umem_get(pd->device, start, length, mr_access_flags);
if (IS_ERR(umem))
return (void *)umem;

View File

@ -169,7 +169,7 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
void *vaddr;
int err;
umem = ib_umem_get(udata, start, length, access);
umem = ib_umem_get(pd->ibpd.device, start, length, access);
if (IS_ERR(umem)) {
pr_warn("err %d from rxe_umem_get\n",
(int)PTR_ERR(umem));

View File

@ -69,7 +69,7 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem)
#ifdef CONFIG_INFINIBAND_USER_MEM
struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
size_t size, int access);
void ib_umem_release(struct ib_umem *umem);
int ib_umem_page_count(struct ib_umem *umem);
@ -83,7 +83,7 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
#include <linux/err.h>
static inline struct ib_umem *ib_umem_get(struct ib_udata *udata,
static inline struct ib_umem *ib_umem_get(struct ib_device *device,
unsigned long addr, size_t size,
int access)
{

View File

@ -114,9 +114,9 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_umem_odp *
ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size,
int access, const struct mmu_interval_notifier_ops *ops);
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
int access);
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr,
@ -134,7 +134,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline struct ib_umem_odp *
ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size,
int access, const struct mmu_interval_notifier_ops *ops)
{
return ERR_PTR(-EINVAL);

View File

@ -4153,6 +4153,15 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
}
/* ib_reg_user_mr - register a memory region for virtual addresses from kernel
* space. This function should be called when 'current' is the owning MM.
*/
struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int mr_access_flags);
/* ib_advise_mr - give an advice about an address range in a memory region */
int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
u32 flags, struct ib_sge *sg_list, u32 num_sge);
/**
* ib_dereg_mr_user - Deregisters a memory region and removes it from the
* HCA translation table.

View File

@ -156,6 +156,13 @@ static void rds_ib_add_one(struct ib_device *device)
has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr &&
device->ops.map_phys_fmr && device->ops.unmap_fmr);
rds_ibdev->use_fastreg = (has_fr && !has_fmr);
rds_ibdev->odp_capable =
!!(device->attrs.device_cap_flags &
IB_DEVICE_ON_DEMAND_PAGING) &&
!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
IB_ODP_SUPPORT_WRITE) &&
!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
IB_ODP_SUPPORT_READ);
rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
rds_ibdev->max_1m_mrs = device->attrs.max_mr ?

View File

@ -247,7 +247,8 @@ struct rds_ib_device {
struct ib_device *dev;
struct ib_pd *pd;
struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */
bool use_fastreg;
u8 use_fastreg:1;
u8 odp_capable:1;
unsigned int max_mrs;
struct rds_ib_mr_pool *mr_1m_pool;

View File

@ -67,6 +67,7 @@ struct rds_ib_frmr {
/* This is stored as mr->r_trans_private. */
struct rds_ib_mr {
struct delayed_work work;
struct rds_ib_device *device;
struct rds_ib_mr_pool *pool;
struct rds_ib_connection *ic;
@ -81,9 +82,11 @@ struct rds_ib_mr {
unsigned int sg_len;
int sg_dma_len;
u8 odp:1;
union {
struct rds_ib_fmr fmr;
struct rds_ib_frmr frmr;
struct ib_mr *mr;
} u;
};
@ -122,12 +125,14 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret,
struct rds_connection *conn);
struct rds_connection *conn, u64 start, u64 length,
int need_odp);
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
int rds_ib_mr_init(void);
void rds_ib_mr_exit(void);
u32 rds_ib_get_lkey(void *trans_private);
void __rds_ib_teardown_mr(struct rds_ib_mr *);
void rds_ib_teardown_mr(struct rds_ib_mr *);

View File

@ -37,8 +37,15 @@
#include "rds_single_path.h"
#include "ib_mr.h"
#include "rds.h"
struct workqueue_struct *rds_ib_mr_wq;
struct rds_ib_dereg_odp_mr {
struct work_struct work;
struct ib_mr *mr;
};
static void rds_ib_odp_mr_worker(struct work_struct *work);
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
@ -213,6 +220,9 @@ void rds_ib_sync_mr(void *trans_private, int direction)
struct rds_ib_mr *ibmr = trans_private;
struct rds_ib_device *rds_ibdev = ibmr->device;
if (ibmr->odp)
return;
switch (direction) {
case DMA_FROM_DEVICE:
ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
@ -482,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
if (ibmr->odp) {
/* A MR created and marked as use_once. We use delayed work,
* because there is a change that we are in interrupt and can't
* call to ib_dereg_mr() directly.
*/
INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
return;
}
/* Return it to the pool's free list */
if (rds_ibdev->use_fastreg)
rds_ib_free_frmr_list(ibmr);
@ -526,9 +546,17 @@ void rds_ib_flush_mrs(void)
up_read(&rds_ib_devices_lock);
}
u32 rds_ib_get_lkey(void *trans_private)
{
struct rds_ib_mr *ibmr = trans_private;
return ibmr->u.mr->lkey;
}
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret,
struct rds_connection *conn)
struct rds_connection *conn,
u64 start, u64 length, int need_odp)
{
struct rds_ib_device *rds_ibdev;
struct rds_ib_mr *ibmr = NULL;
@ -541,6 +569,51 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
goto out;
}
if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
int access_flags =
(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
IB_ACCESS_ON_DEMAND);
struct ib_sge sge = {};
struct ib_mr *ib_mr;
if (!rds_ibdev->odp_capable) {
ret = -EOPNOTSUPP;
goto out;
}
ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
access_flags);
if (IS_ERR(ib_mr)) {
rdsdebug("rds_ib_get_user_mr returned %d\n",
IS_ERR(ib_mr));
ret = PTR_ERR(ib_mr);
goto out;
}
if (key_ret)
*key_ret = ib_mr->rkey;
ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
if (!ibmr) {
ib_dereg_mr(ib_mr);
ret = -ENOMEM;
goto out;
}
ibmr->u.mr = ib_mr;
ibmr->odp = 1;
sge.addr = virt_addr;
sge.length = length;
sge.lkey = ib_mr->lkey;
ib_advise_mr(rds_ibdev->pd,
IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
return ibmr;
}
if (conn)
ic = conn->c_transport_data;
@ -629,3 +702,12 @@ void rds_ib_mr_exit(void)
{
destroy_workqueue(rds_ib_mr_wq);
}
static void rds_ib_odp_mr_worker(struct work_struct *work)
{
struct rds_ib_mr *ibmr;
ibmr = container_of(work, struct rds_ib_mr, work.work);
ib_dereg_mr(ibmr->u.mr);
kfree(ibmr);
}

View File

@ -39,6 +39,7 @@
#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"
/*
* Convert IB-specific error message to RDS error message and call core
@ -635,6 +636,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
send->s_sge[0].addr = ic->i_send_hdrs_dma[pos];
send->s_sge[0].length = sizeof(struct rds_header);
send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
sizeof(struct rds_header));
@ -650,6 +652,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
send->s_sge[1].addr = sg_dma_address(scat);
send->s_sge[1].addr += rm->data.op_dmaoff;
send->s_sge[1].length = len;
send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
bytes_sent += len;
rm->data.op_dmaoff += len;
@ -858,20 +861,29 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
int ret;
int num_sge;
int nr_sig = 0;
u64 odp_addr = op->op_odp_addr;
u32 odp_lkey = 0;
/* map the op the first time we see it */
if (!op->op_mapped) {
op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
op->op_sg, op->op_nents, (op->op_write) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE);
rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
if (op->op_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
if (!op->op_odp_mr) {
if (!op->op_mapped) {
op->op_count =
ib_dma_map_sg(ic->i_cm_id->device, op->op_sg,
op->op_nents,
(op->op_write) ? DMA_TO_DEVICE :
DMA_FROM_DEVICE);
rdsdebug("ic %p mapping op %p: %d\n", ic, op,
op->op_count);
if (op->op_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
}
op->op_mapped = 1;
}
op->op_mapped = 1;
} else {
op->op_count = op->op_nents;
odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private);
}
/*
@ -923,14 +935,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
scat != &op->op_sg[op->op_count]; j++) {
len = sg_dma_len(scat);
send->s_sge[j].addr = sg_dma_address(scat);
if (!op->op_odp_mr) {
send->s_sge[j].addr = sg_dma_address(scat);
send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
} else {
send->s_sge[j].addr = odp_addr;
send->s_sge[j].lkey = odp_lkey;
}
send->s_sge[j].length = len;
send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
sent += len;
rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
remote_addr += len;
odp_addr += len;
scat++;
}

View File

@ -156,11 +156,13 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
struct page **pages, int write)
{
unsigned int gup_flags = FOLL_LONGTERM;
int ret;
ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0,
pages);
if (write)
gup_flags |= FOLL_WRITE;
ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
if (ret >= 0 && ret < nr_pages) {
while (ret--)
put_page(pages[ret]);
@ -175,13 +177,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
struct rds_conn_path *cp)
{
struct rds_mr *mr = NULL, *found;
struct scatterlist *sg = NULL;
unsigned int nr_pages;
struct page **pages = NULL;
struct scatterlist *sg;
void *trans_private;
unsigned long flags;
rds_rdma_cookie_t cookie;
unsigned int nents;
unsigned int nents = 0;
int need_odp = 0;
long i;
int ret;
@ -195,6 +198,21 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out;
}
/* If the combination of the addr and size requested for this memory
* region causes an integer overflow, return error.
*/
if (((args->vec.addr + args->vec.bytes) < args->vec.addr) ||
PAGE_ALIGN(args->vec.addr + args->vec.bytes) <
(args->vec.addr + args->vec.bytes)) {
ret = -EINVAL;
goto out;
}
if (!can_do_mlock()) {
ret = -EPERM;
goto out;
}
nr_pages = rds_pages_in_vec(&args->vec);
if (nr_pages == 0) {
ret = -EINVAL;
@ -248,36 +266,44 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
* the zero page.
*/
ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
if (ret < 0)
if (ret == -EOPNOTSUPP) {
need_odp = 1;
} else if (ret <= 0) {
goto out;
} else {
nents = ret;
sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
if (!sg) {
ret = -ENOMEM;
goto out;
}
WARN_ON(!nents);
sg_init_table(sg, nents);
nents = ret;
sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
if (!sg) {
ret = -ENOMEM;
goto out;
/* Stick all pages into the scatterlist */
for (i = 0 ; i < nents; i++)
sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
rdsdebug("RDS: trans_private nents is %u\n", nents);
}
WARN_ON(!nents);
sg_init_table(sg, nents);
/* Stick all pages into the scatterlist */
for (i = 0 ; i < nents; i++)
sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
rdsdebug("RDS: trans_private nents is %u\n", nents);
/* Obtain a transport specific MR. If this succeeds, the
* s/g list is now owned by the MR.
* Note that dma_map() implies that pending writes are
* flushed to RAM, so no dma_sync is needed here. */
trans_private = rs->rs_transport->get_mr(sg, nents, rs,
&mr->r_key,
cp ? cp->cp_conn : NULL);
trans_private = rs->rs_transport->get_mr(
sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL,
args->vec.addr, args->vec.bytes,
need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED);
if (IS_ERR(trans_private)) {
for (i = 0 ; i < nents; i++)
put_page(sg_page(&sg[i]));
kfree(sg);
/* In ODP case, we don't GUP pages, so don't need
* to release anything.
*/
if (!need_odp) {
for (i = 0 ; i < nents; i++)
put_page(sg_page(&sg[i]));
kfree(sg);
}
ret = PTR_ERR(trans_private);
goto out;
}
@ -291,7 +317,11 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
* map page aligned regions. So we keep the offset, and build
* a 64bit cookie containing <R_Key, offset> and pass that
* around. */
cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
if (need_odp)
cookie = rds_rdma_make_cookie(mr->r_key, 0);
else
cookie = rds_rdma_make_cookie(mr->r_key,
args->vec.addr & ~PAGE_MASK);
if (cookie_ret)
*cookie_ret = cookie;
@ -456,22 +486,26 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)
{
unsigned int i;
for (i = 0; i < ro->op_nents; i++) {
struct page *page = sg_page(&ro->op_sg[i]);
if (ro->op_odp_mr) {
rds_mr_put(ro->op_odp_mr);
} else {
for (i = 0; i < ro->op_nents; i++) {
struct page *page = sg_page(&ro->op_sg[i]);
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
if (!ro->op_write) {
WARN_ON(!page->mapping && irqs_disabled());
set_page_dirty(page);
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory
*/
if (!ro->op_write)
set_page_dirty(page);
put_page(page);
}
put_page(page);
}
kfree(ro->op_notifier);
ro->op_notifier = NULL;
ro->op_active = 0;
ro->op_odp_mr = NULL;
}
void rds_atomic_free_op(struct rm_atomic_op *ao)
@ -581,6 +615,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct rds_iovec *iovs;
unsigned int i, j;
int ret = 0;
bool odp_supported = true;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
|| rm->rdma.op_active)
@ -602,6 +637,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
ret = -EINVAL;
goto out_ret;
}
/* odp-mr is not supported for multiple requests within one message */
if (args->nr_local != 1)
odp_supported = false;
iovs = vec->iov;
@ -623,6 +661,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
op->op_active = 1;
op->op_recverr = rs->rs_recverr;
op->op_odp_mr = NULL;
WARN_ON(!nr_pages);
op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret);
if (!op->op_sg)
@ -672,10 +712,44 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
* If it's a READ operation, we need to pin the pages for writing.
*/
ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
if (ret < 0)
if ((!odp_supported && ret <= 0) ||
(odp_supported && ret <= 0 && ret != -EOPNOTSUPP))
goto out_pages;
else
ret = 0;
if (ret == -EOPNOTSUPP) {
struct rds_mr *local_odp_mr;
if (!rs->rs_transport->get_mr) {
ret = -EOPNOTSUPP;
goto out_pages;
}
local_odp_mr =
kzalloc(sizeof(*local_odp_mr), GFP_KERNEL);
if (!local_odp_mr) {
ret = -ENOMEM;
goto out_pages;
}
RB_CLEAR_NODE(&local_odp_mr->r_rb_node);
refcount_set(&local_odp_mr->r_refcount, 1);
local_odp_mr->r_trans = rs->rs_transport;
local_odp_mr->r_sock = rs;
local_odp_mr->r_trans_private =
rs->rs_transport->get_mr(
NULL, 0, rs, &local_odp_mr->r_key, NULL,
iov->addr, iov->bytes, ODP_VIRTUAL);
if (IS_ERR(local_odp_mr->r_trans_private)) {
ret = IS_ERR(local_odp_mr->r_trans_private);
rdsdebug("get_mr ret %d %p\"", ret,
local_odp_mr->r_trans_private);
kfree(local_odp_mr);
ret = -EOPNOTSUPP;
goto out_pages;
}
rdsdebug("Need odp; local_odp_mr %p trans_private %p\n",
local_odp_mr, local_odp_mr->r_trans_private);
op->op_odp_mr = local_odp_mr;
op->op_odp_addr = iov->addr;
}
rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
nr_bytes, nr, iov->bytes, iov->addr);
@ -691,6 +765,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
offset);
sg_dma_len(sg) = sg->length;
rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
sg->offset, sg->length, iov->addr, iov->bytes);
@ -709,6 +784,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
goto out_pages;
}
op->op_bytes = nr_bytes;
ret = 0;
out_pages:
kfree(pages);
@ -755,7 +831,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (mr) {
mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
mr->r_trans->sync_mr(mr->r_trans_private,
DMA_TO_DEVICE);
rm->rdma.op_rdma_mr = mr;
}
return err;

View File

@ -40,7 +40,6 @@
#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
#endif
#ifdef RDS_DEBUG
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
@ -478,6 +477,9 @@ struct rds_message {
struct rds_notifier *op_notifier;
struct rds_mr *op_rdma_mr;
u64 op_odp_addr;
struct rds_mr *op_odp_mr;
} rdma;
struct rm_data_op {
unsigned int op_active:1;
@ -573,7 +575,8 @@ struct rds_transport {
void (*exit)(void);
void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
struct rds_sock *rs, u32 *key_ret,
struct rds_connection *conn);
struct rds_connection *conn,
u64 start, u64 length, int need_odp);
void (*sync_mr)(void *trans_private, int direction);
void (*free_mr)(void *trans_private, int invalidate);
void (*flush_mrs)(void);
@ -956,6 +959,12 @@ static inline bool rds_destroy_pending(struct rds_connection *conn)
(conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn));
}
enum {
ODP_NOT_NEEDED,
ODP_ZEROBASED,
ODP_VIRTUAL
};
/* stats.c */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
#define rds_stats_inc_which(which, member) do { \