Mark Bloch 1fb7f8973f RDMA: Support more than 255 rdma ports
Current code uses many different types when dealing with a port of a RDMA
device: u8, unsigned int and u32. Switch to u32 to clean up the logic.

This allows us to make (at least) the core view consistent and use the
same type. Unfortunately not all places can be converted. Many uverbs
functions expect port to be u8 so keep those places in order not to break
UAPIs.  HW/Spec defined values must also not be changed.

With the switch to u32 we now can support devices with more than 255
ports. U32_MAX is reserved to make control logic a bit easier to deal
with. As a device with U32_MAX ports probably isn't going to happen any
time soon this seems like a non issue.

When a device with more than 255 ports is created uverbs will report the
RDMA device as having 255 ports as this is the max currently supported.

The verbs interface is not changed yet because the IBTA spec limits the
port size in too many places to be u8 and all applications that relies in
verbs won't be able to cope with this change. At this stage, we are
extending the interfaces that are using vendor channel solely

Once the limitation is lifted mlx5 in switchdev mode will be able to have
thousands of SFs created by the device. As the only instance of an RDMA
device that reports more than 255 ports will be a representor device and
it exposes itself as a RAW Ethernet only device CM/MAD/IPoIB and other
ULPs aren't effected by this change and their sysfs/interfaces that are
exposes to userspace can remain unchanged.

While here cleanup some alignment issues and remove unneeded sanity
checks (mainly in rdmavt),

Link: https://lore.kernel.org/r/20210301070420.439400-1-leon@kernel.org
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2021-03-26 09:31:21 -03:00

570 lines
16 KiB
C

/*
* Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/device.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ethtool.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/io.h>
#include <asm/irq.h>
#include <asm/byteorder.h>
#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
#include "iw_cxgb4.h"
static int fastreg_support = 1;
module_param(fastreg_support, int, 0644);
MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)");
static void c4iw_dealloc_ucontext(struct ib_ucontext *context)
{
struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
struct c4iw_dev *rhp;
struct c4iw_mm_entry *mm, *tmp;
pr_debug("context %p\n", context);
rhp = to_c4iw_dev(ucontext->ibucontext.device);
list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
kfree(mm);
c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
}
static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext,
struct ib_udata *udata)
{
struct ib_device *ibdev = ucontext->device;
struct c4iw_ucontext *context = to_c4iw_ucontext(ucontext);
struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
struct c4iw_alloc_ucontext_resp uresp;
int ret = 0;
struct c4iw_mm_entry *mm = NULL;
pr_debug("ibdev %p\n", ibdev);
c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
INIT_LIST_HEAD(&context->mmaps);
spin_lock_init(&context->mmap_lock);
if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) {
pr_err_once("Warning - downlevel libcxgb4 (non-fatal), device status page disabled\n");
rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED;
} else {
mm = kmalloc(sizeof(*mm), GFP_KERNEL);
if (!mm) {
ret = -ENOMEM;
goto err;
}
uresp.status_page_size = PAGE_SIZE;
spin_lock(&context->mmap_lock);
uresp.status_page_key = context->key;
context->key += PAGE_SIZE;
spin_unlock(&context->mmap_lock);
ret = ib_copy_to_udata(udata, &uresp,
sizeof(uresp) - sizeof(uresp.reserved));
if (ret)
goto err_mm;
mm->key = uresp.status_page_key;
mm->addr = virt_to_phys(rhp->rdev.status_page);
mm->len = PAGE_SIZE;
insert_mmap(context, mm);
}
return 0;
err_mm:
kfree(mm);
err:
return ret;
}
static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
{
int len = vma->vm_end - vma->vm_start;
u32 key = vma->vm_pgoff << PAGE_SHIFT;
struct c4iw_rdev *rdev;
int ret = 0;
struct c4iw_mm_entry *mm;
struct c4iw_ucontext *ucontext;
u64 addr;
pr_debug("pgoff 0x%lx key 0x%x len %d\n", vma->vm_pgoff,
key, len);
if (vma->vm_start & (PAGE_SIZE-1))
return -EINVAL;
rdev = &(to_c4iw_dev(context->device)->rdev);
ucontext = to_c4iw_ucontext(context);
mm = remove_mmap(ucontext, key, len);
if (!mm)
return -EINVAL;
addr = mm->addr;
kfree(mm);
if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) &&
(addr < (pci_resource_start(rdev->lldi.pdev, 0) +
pci_resource_len(rdev->lldi.pdev, 0)))) {
/*
* MA_SYNC register...
*/
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
ret = io_remap_pfn_range(vma, vma->vm_start,
addr >> PAGE_SHIFT,
len, vma->vm_page_prot);
} else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) &&
(addr < (pci_resource_start(rdev->lldi.pdev, 2) +
pci_resource_len(rdev->lldi.pdev, 2)))) {
/*
* Map user DB or OCQP memory...
*/
if (addr >= rdev->oc_mw_pa)
vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot);
else {
if (!is_t4(rdev->lldi.adapter_type))
vma->vm_page_prot =
t4_pgprot_wc(vma->vm_page_prot);
else
vma->vm_page_prot =
pgprot_noncached(vma->vm_page_prot);
}
ret = io_remap_pfn_range(vma, vma->vm_start,
addr >> PAGE_SHIFT,
len, vma->vm_page_prot);
} else {
/*
* Map WQ or CQ contig dma memory...
*/
ret = remap_pfn_range(vma, vma->vm_start,
addr >> PAGE_SHIFT,
len, vma->vm_page_prot);
}
return ret;
}
static int c4iw_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata)
{
struct c4iw_dev *rhp;
struct c4iw_pd *php;
php = to_c4iw_pd(pd);
rhp = php->rhp;
pr_debug("ibpd %p pdid 0x%x\n", pd, php->pdid);
c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid);
mutex_lock(&rhp->rdev.stats.lock);
rhp->rdev.stats.pd.cur--;
mutex_unlock(&rhp->rdev.stats.lock);
return 0;
}
static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_udata *udata)
{
struct c4iw_pd *php = to_c4iw_pd(pd);
struct ib_device *ibdev = pd->device;
u32 pdid;
struct c4iw_dev *rhp;
pr_debug("ibdev %p\n", ibdev);
rhp = (struct c4iw_dev *) ibdev;
pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table);
if (!pdid)
return -EINVAL;
php->pdid = pdid;
php->rhp = rhp;
if (udata) {
struct c4iw_alloc_pd_resp uresp = {.pdid = php->pdid};
if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
c4iw_deallocate_pd(&php->ibpd, udata);
return -EFAULT;
}
}
mutex_lock(&rhp->rdev.stats.lock);
rhp->rdev.stats.pd.cur++;
if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max)
rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur;
mutex_unlock(&rhp->rdev.stats.lock);
pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php);
return 0;
}
static int c4iw_query_gid(struct ib_device *ibdev, u32 port, int index,
union ib_gid *gid)
{
struct c4iw_dev *dev;
pr_debug("ibdev %p, port %u, index %d, gid %p\n",
ibdev, port, index, gid);
if (!port)
return -EINVAL;
dev = to_c4iw_dev(ibdev);
memset(&(gid->raw[0]), 0, sizeof(gid->raw));
memcpy(&(gid->raw[0]), dev->rdev.lldi.ports[port-1]->dev_addr, 6);
return 0;
}
static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
struct ib_udata *uhw)
{
struct c4iw_dev *dev;
pr_debug("ibdev %p\n", ibdev);
if (uhw->inlen || uhw->outlen)
return -EINVAL;
dev = to_c4iw_dev(ibdev);
memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type);
props->fw_ver = dev->rdev.lldi.fw_vers;
props->device_cap_flags = dev->device_cap_flags;
props->page_size_cap = T4_PAGESIZE_MASK;
props->vendor_id = (u32)dev->rdev.lldi.pdev->vendor;
props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device;
props->max_mr_size = T4_MAX_MR_SIZE;
props->max_qp = dev->rdev.lldi.vr->qp.size / 2;
props->max_srq = dev->rdev.lldi.vr->srq.size;
props->max_qp_wr = dev->rdev.hw_queue.t4_max_qp_depth;
props->max_srq_wr = dev->rdev.hw_queue.t4_max_qp_depth;
props->max_send_sge = min(T4_MAX_SEND_SGE, T4_MAX_WRITE_SGE);
props->max_recv_sge = T4_MAX_RECV_SGE;
props->max_srq_sge = T4_MAX_RECV_SGE;
props->max_sge_rd = 1;
props->max_res_rd_atom = dev->rdev.lldi.max_ird_adapter;
props->max_qp_rd_atom = min(dev->rdev.lldi.max_ordird_qp,
c4iw_max_read_depth);
props->max_qp_init_rd_atom = props->max_qp_rd_atom;
props->max_cq = dev->rdev.lldi.vr->qp.size;
props->max_cqe = dev->rdev.hw_queue.t4_max_cq_depth;
props->max_mr = c4iw_num_stags(&dev->rdev);
props->max_pd = T4_MAX_NUM_PD;
props->local_ca_ack_delay = 0;
props->max_fast_reg_page_list_len =
t4_max_fr_depth(dev->rdev.lldi.ulptx_memwrite_dsgl && use_dsgl);
return 0;
}
static int c4iw_query_port(struct ib_device *ibdev, u32 port,
struct ib_port_attr *props)
{
int ret = 0;
pr_debug("ibdev %p\n", ibdev);
ret = ib_get_eth_speed(ibdev, port, &props->active_speed,
&props->active_width);
props->port_cap_flags =
IB_PORT_CM_SUP |
IB_PORT_SNMP_TUNNEL_SUP |
IB_PORT_REINIT_SUP |
IB_PORT_DEVICE_MGMT_SUP |
IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
props->gid_tbl_len = 1;
props->max_msg_sz = -1;
return ret;
}
static ssize_t hw_rev_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct c4iw_dev *c4iw_dev =
rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
pr_debug("dev 0x%p\n", dev);
return sysfs_emit(
buf, "%d\n",
CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
}
static DEVICE_ATTR_RO(hw_rev);
static ssize_t hca_type_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct c4iw_dev *c4iw_dev =
rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
struct ethtool_drvinfo info;
struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0];
pr_debug("dev 0x%p\n", dev);
lldev->ethtool_ops->get_drvinfo(lldev, &info);
return sysfs_emit(buf, "%s\n", info.driver);
}
static DEVICE_ATTR_RO(hca_type);
static ssize_t board_id_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct c4iw_dev *c4iw_dev =
rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
pr_debug("dev 0x%p\n", dev);
return sysfs_emit(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
c4iw_dev->rdev.lldi.pdev->device);
}
static DEVICE_ATTR_RO(board_id);
enum counters {
IP4INSEGS,
IP4OUTSEGS,
IP4RETRANSSEGS,
IP4OUTRSTS,
IP6INSEGS,
IP6OUTSEGS,
IP6RETRANSSEGS,
IP6OUTRSTS,
NR_COUNTERS
};
static const char * const names[] = {
[IP4INSEGS] = "ip4InSegs",
[IP4OUTSEGS] = "ip4OutSegs",
[IP4RETRANSSEGS] = "ip4RetransSegs",
[IP4OUTRSTS] = "ip4OutRsts",
[IP6INSEGS] = "ip6InSegs",
[IP6OUTSEGS] = "ip6OutSegs",
[IP6RETRANSSEGS] = "ip6RetransSegs",
[IP6OUTRSTS] = "ip6OutRsts"
};
static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev,
u32 port_num)
{
BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
if (port_num != 0)
return NULL;
return rdma_alloc_hw_stats_struct(names, NR_COUNTERS,
RDMA_HW_STATS_DEFAULT_LIFESPAN);
}
static int c4iw_get_mib(struct ib_device *ibdev,
struct rdma_hw_stats *stats,
u32 port, int index)
{
struct tp_tcp_stats v4, v6;
struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev);
cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6);
stats->value[IP4INSEGS] = v4.tcp_in_segs;
stats->value[IP4OUTSEGS] = v4.tcp_out_segs;
stats->value[IP4RETRANSSEGS] = v4.tcp_retrans_segs;
stats->value[IP4OUTRSTS] = v4.tcp_out_rsts;
stats->value[IP6INSEGS] = v6.tcp_in_segs;
stats->value[IP6OUTSEGS] = v6.tcp_out_segs;
stats->value[IP6RETRANSSEGS] = v6.tcp_retrans_segs;
stats->value[IP6OUTRSTS] = v6.tcp_out_rsts;
return stats->num_counters;
}
static struct attribute *c4iw_class_attributes[] = {
&dev_attr_hw_rev.attr,
&dev_attr_hca_type.attr,
&dev_attr_board_id.attr,
NULL
};
static const struct attribute_group c4iw_attr_group = {
.attrs = c4iw_class_attributes,
};
static int c4iw_port_immutable(struct ib_device *ibdev, u32 port_num,
struct ib_port_immutable *immutable)
{
struct ib_port_attr attr;
int err;
immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
err = ib_query_port(ibdev, port_num, &attr);
if (err)
return err;
immutable->gid_tbl_len = attr.gid_tbl_len;
return 0;
}
static void get_dev_fw_str(struct ib_device *dev, char *str)
{
struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
ibdev);
pr_debug("dev 0x%p\n", dev);
snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u.%u",
FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
}
static const struct ib_device_ops c4iw_dev_ops = {
.owner = THIS_MODULE,
.driver_id = RDMA_DRIVER_CXGB4,
.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION,
.alloc_hw_stats = c4iw_alloc_stats,
.alloc_mr = c4iw_alloc_mr,
.alloc_pd = c4iw_allocate_pd,
.alloc_ucontext = c4iw_alloc_ucontext,
.create_cq = c4iw_create_cq,
.create_qp = c4iw_create_qp,
.create_srq = c4iw_create_srq,
.dealloc_pd = c4iw_deallocate_pd,
.dealloc_ucontext = c4iw_dealloc_ucontext,
.dereg_mr = c4iw_dereg_mr,
.destroy_cq = c4iw_destroy_cq,
.destroy_qp = c4iw_destroy_qp,
.destroy_srq = c4iw_destroy_srq,
.fill_res_cq_entry = c4iw_fill_res_cq_entry,
.fill_res_cm_id_entry = c4iw_fill_res_cm_id_entry,
.fill_res_mr_entry = c4iw_fill_res_mr_entry,
.get_dev_fw_str = get_dev_fw_str,
.get_dma_mr = c4iw_get_dma_mr,
.get_hw_stats = c4iw_get_mib,
.get_port_immutable = c4iw_port_immutable,
.iw_accept = c4iw_accept_cr,
.iw_add_ref = c4iw_qp_add_ref,
.iw_connect = c4iw_connect,
.iw_create_listen = c4iw_create_listen,
.iw_destroy_listen = c4iw_destroy_listen,
.iw_get_qp = c4iw_get_qp,
.iw_reject = c4iw_reject_cr,
.iw_rem_ref = c4iw_qp_rem_ref,
.map_mr_sg = c4iw_map_mr_sg,
.mmap = c4iw_mmap,
.modify_qp = c4iw_ib_modify_qp,
.modify_srq = c4iw_modify_srq,
.poll_cq = c4iw_poll_cq,
.post_recv = c4iw_post_receive,
.post_send = c4iw_post_send,
.post_srq_recv = c4iw_post_srq_recv,
.query_device = c4iw_query_device,
.query_gid = c4iw_query_gid,
.query_port = c4iw_query_port,
.query_qp = c4iw_ib_query_qp,
.reg_user_mr = c4iw_reg_user_mr,
.req_notify_cq = c4iw_arm_cq,
INIT_RDMA_OBJ_SIZE(ib_cq, c4iw_cq, ibcq),
INIT_RDMA_OBJ_SIZE(ib_mw, c4iw_mw, ibmw),
INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
INIT_RDMA_OBJ_SIZE(ib_srq, c4iw_srq, ibsrq),
INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
};
static int set_netdevs(struct ib_device *ib_dev, struct c4iw_rdev *rdev)
{
int ret;
int i;
for (i = 0; i < rdev->lldi.nports; i++) {
ret = ib_device_set_netdev(ib_dev, rdev->lldi.ports[i],
i + 1);
if (ret)
return ret;
}
return 0;
}
void c4iw_register_device(struct work_struct *work)
{
int ret;
struct uld_ctx *ctx = container_of(work, struct uld_ctx, reg_work);
struct c4iw_dev *dev = ctx->dev;
pr_debug("c4iw_dev %p\n", dev);
memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
if (fastreg_support)
dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
dev->ibdev.local_dma_lkey = 0;
dev->ibdev.node_type = RDMA_NODE_RNIC;
BUILD_BUG_ON(sizeof(C4IW_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX);
memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC));
dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports;
dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq;
dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev;
memcpy(dev->ibdev.iw_ifname, dev->rdev.lldi.ports[0]->name,
sizeof(dev->ibdev.iw_ifname));
rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops);
ret = set_netdevs(&dev->ibdev, &dev->rdev);
if (ret)
goto err_dealloc_ctx;
dma_set_max_seg_size(&dev->rdev.lldi.pdev->dev, UINT_MAX);
ret = ib_register_device(&dev->ibdev, "cxgb4_%d",
&dev->rdev.lldi.pdev->dev);
if (ret)
goto err_dealloc_ctx;
return;
err_dealloc_ctx:
pr_err("%s - Failed registering iwarp device: %d\n",
pci_name(ctx->lldi.pdev), ret);
c4iw_dealloc(ctx);
return;
}
void c4iw_unregister_device(struct c4iw_dev *dev)
{
pr_debug("c4iw_dev %p\n", dev);
ib_unregister_device(&dev->ibdev);
return;
}