From 51976c6cd786151b6a1bdf8b8b3334beac0ba99c Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 27 Sep 2024 18:33:22 +0800 Subject: [PATCH 01/51] RDMA/core: Provide rdma_user_mmap_disassociate() to disassociate mmap pages Provide a new api rdma_user_mmap_disassociate() for drivers to disassociate mmap pages for a device. Since drivers can now disassociate mmaps by calling this api, introduce a new disassociation_lock to specifically prevent races between this disassociation process and new mmaps. And thus the old hw_destroy_rwsem is not needed in this api. Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240927103323.1897094-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs.h | 2 ++ drivers/infiniband/core/uverbs_main.c | 43 +++++++++++++++++++++++++-- include/rdma/ib_verbs.h | 8 +++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 821d93c8f712..dfd2e5a86e6f 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -160,6 +160,8 @@ struct ib_uverbs_file { struct page *disassociate_page; struct xarray idr; + + struct mutex disassociation_lock; }; struct ib_uverbs_event { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 94454186ed81..85cfc790a7bb 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -76,6 +76,7 @@ static dev_t dynamic_uverbs_dev; static DEFINE_IDA(uverbs_ida); static int ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); +static struct ib_client uverbs_client; static char *uverbs_devnode(const struct device *dev, umode_t *mode) { @@ -217,6 +218,7 @@ void ib_uverbs_release_file(struct kref *ref) if (file->disassociate_page) __free_pages(file->disassociate_page, 0); + mutex_destroy(&file->disassociation_lock); mutex_destroy(&file->umap_lock); mutex_destroy(&file->ucontext_lock); kfree(file); @@ -698,8 +700,13 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) ret = PTR_ERR(ucontext); goto out; } + + mutex_lock(&file->disassociation_lock); + vma->vm_ops = &rdma_umap_ops; ret = ucontext->device->ops.mmap(ucontext, vma); + + mutex_unlock(&file->disassociation_lock); out: srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return ret; @@ -721,6 +728,8 @@ static void rdma_umap_open(struct vm_area_struct *vma) /* We are racing with disassociation */ if (!down_read_trylock(&ufile->hw_destroy_rwsem)) goto out_zap; + mutex_lock(&ufile->disassociation_lock); + /* * Disassociation already completed, the VMA should already be zapped. */ @@ -732,10 +741,12 @@ static void rdma_umap_open(struct vm_area_struct *vma) goto out_unlock; rdma_umap_priv_init(priv, vma, opriv->entry); + mutex_unlock(&ufile->disassociation_lock); up_read(&ufile->hw_destroy_rwsem); return; out_unlock: + mutex_unlock(&ufile->disassociation_lock); up_read(&ufile->hw_destroy_rwsem); out_zap: /* @@ -819,7 +830,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) { struct rdma_umap_priv *priv, *next_priv; - lockdep_assert_held(&ufile->hw_destroy_rwsem); + mutex_lock(&ufile->disassociation_lock); while (1) { struct mm_struct *mm = NULL; @@ -845,8 +856,10 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) break; } mutex_unlock(&ufile->umap_lock); - if (!mm) + if (!mm) { + mutex_unlock(&ufile->disassociation_lock); return; + } /* * The umap_lock is nested under mmap_lock since it used within @@ -876,8 +889,32 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) mmap_read_unlock(mm); mmput(mm); } + + mutex_unlock(&ufile->disassociation_lock); } +/** + * rdma_user_mmap_disassociate() - Revoke mmaps for a device + * @device: device to revoke + * + * This function should be called by drivers that need to disable mmaps for the + * device, for instance because it is going to be reset. + */ +void rdma_user_mmap_disassociate(struct ib_device *device) +{ + struct ib_uverbs_device *uverbs_dev = + ib_get_client_data(device, &uverbs_client); + struct ib_uverbs_file *ufile; + + mutex_lock(&uverbs_dev->lists_mutex); + list_for_each_entry(ufile, &uverbs_dev->uverbs_file_list, list) { + if (ufile->ucontext) + uverbs_user_mmap_disassociate(ufile); + } + mutex_unlock(&uverbs_dev->lists_mutex); +} +EXPORT_SYMBOL(rdma_user_mmap_disassociate); + /* * ib_uverbs_open() does not need the BKL: * @@ -947,6 +984,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) mutex_init(&file->umap_lock); INIT_LIST_HEAD(&file->umaps); + mutex_init(&file->disassociation_lock); + filp->private_data = file; list_add_tail(&file->list, &dev->uverbs_file_list); mutex_unlock(&dev->lists_mutex); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index aa8ede439905..9cb8b5fe7eee 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2948,6 +2948,14 @@ int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext, size_t length, u32 min_pgoff, u32 max_pgoff); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +void rdma_user_mmap_disassociate(struct ib_device *device); +#else +static inline void rdma_user_mmap_disassociate(struct ib_device *device) +{ +} +#endif + static inline int rdma_user_mmap_entry_insert_exact(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, From 615b94746a54702af923b28bd8a629f4ac0ff0d8 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 27 Sep 2024 18:33:23 +0800 Subject: [PATCH 02/51] RDMA/hns: Disassociate mmap pages for all uctx when HW is being reset When HW is being reset, userspace should not ring doorbell otherwise it may lead to abnormal consequence such as RAS. Disassociate mmap pages for all uctx to prevent userspace from ringing doorbell to HW. Since all resources will be destroyed during HW reset, no new mmap is allowed after HW reset is completed. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20240927103323.1897094-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++++ drivers/infiniband/hw/hns/hns_roce_main.c | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 24e906b9d3ae..f1feaa79f78e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7017,6 +7017,7 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; } + static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) { struct hns_roce_dev *hr_dev; @@ -7035,6 +7036,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) hr_dev->active = false; hr_dev->dis_db = true; + + rdma_user_mmap_disassociate(&hr_dev->ib_dev); + hr_dev->state = HNS_ROCE_DEVICE_STATE_RST_DOWN; return 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 4cb0af733587..49315f39361d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -466,6 +466,11 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma) pgprot_t prot; int ret; + if (hr_dev->dis_db) { + atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]); + return -EPERM; + } + rdma_entry = rdma_user_mmap_entry_get_pgoff(uctx, vma->vm_pgoff); if (!rdma_entry) { atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]); From 89e9ae55dc56f322f1a123224ad9d3e52bcc3b50 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 8 Oct 2024 00:53:26 +0100 Subject: [PATCH 03/51] IB/hfi1: make clear_all_interrupts static clear_all_interrupts() in hw/hfi1/chip.c is currently global but only used in the same file, so make it static. There are also 'clear_all_interrupts' functions in i2c-nomadik and emif.c but fortunately they're already static. (Build and boot tested only, I don't have this hardware) Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241007235327.128613-1-linux@treblig.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/chip.c | 2 +- drivers/infiniband/hw/hfi1/chip.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index c52e6b2c9914..a442eca498b8 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -13235,7 +13235,7 @@ int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set) /* * Clear all interrupt sources on the chip. */ -void clear_all_interrupts(struct hfi1_devdata *dd) +static void clear_all_interrupts(struct hfi1_devdata *dd) { int i; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index d861aa8fc640..8841db16bde7 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1404,7 +1404,6 @@ irqreturn_t receive_context_interrupt_napi(int irq, void *data); int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set); void init_qsfp_int(struct hfi1_devdata *dd); -void clear_all_interrupts(struct hfi1_devdata *dd); void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr); void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr); void reset_interrupts(struct hfi1_devdata *dd); From 1e7b86f1b26ba5737481a403a26bb6182792a931 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Tue, 15 Oct 2024 17:42:41 +0000 Subject: [PATCH 04/51] RDMA/efa: Update device interface Update device interface header files. Link: https://patch.msgid.link/r/20241015174242.3490-2-mrgolin@amazon.com Reviewed-by: Daniel Kranzdorf Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin Signed-off-by: Jason Gunthorpe --- .../infiniband/hw/efa/efa_admin_cmds_defs.h | 54 +++++++-- drivers/infiniband/hw/efa/efa_admin_defs.h | 4 +- drivers/infiniband/hw/efa/efa_com_cmd.c | 4 +- drivers/infiniband/hw/efa/efa_com_cmd.h | 2 +- drivers/infiniband/hw/efa/efa_io_defs.h | 106 ++++++++++++++++-- drivers/infiniband/hw/efa/efa_verbs.c | 2 +- 6 files changed, 149 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index cd03a5429beb..88a9aee7e743 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -30,7 +30,8 @@ enum efa_admin_aq_opcode { EFA_ADMIN_DEALLOC_UAR = 17, EFA_ADMIN_CREATE_EQ = 18, EFA_ADMIN_DESTROY_EQ = 19, - EFA_ADMIN_MAX_OPCODE = 19, + EFA_ADMIN_ALLOC_MR = 20, + EFA_ADMIN_MAX_OPCODE = 20, }; enum efa_admin_aq_feature_id { @@ -150,8 +151,11 @@ struct efa_admin_create_qp_cmd { /* UAR number */ u16 uar; + /* Requested service level for the QP, 0 is the default SL */ + u8 sl; + /* MBZ */ - u16 reserved; + u8 reserved; /* MBZ */ u32 reserved2; @@ -459,6 +463,41 @@ struct efa_admin_dereg_mr_resp { struct efa_admin_acq_common_desc acq_common_desc; }; +/* + * Allocation of MemoryRegion, required for QP working with Virtual + * Addresses in kernel verbs semantics, ready for fast registration use. + */ +struct efa_admin_alloc_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain */ + u16 pd; + + /* MBZ */ + u16 reserved1; + + /* Maximum number of pages this MR supports. */ + u32 max_pages; +}; + +struct efa_admin_alloc_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * L_Key, to be used in conjunction with local buffer references in + * SQ and RQ WQE, or with virtual RQ/CQ rings + */ + u32 l_key; + + /* + * R_Key, to be used in RDMA messages to refer to remotely accessed + * memory region + */ + u32 r_key; +}; + struct efa_admin_create_cq_cmd { struct efa_admin_aq_common_desc aq_common_desc; @@ -483,8 +522,8 @@ struct efa_admin_create_cq_cmd { */ u8 cq_caps_2; - /* completion queue depth in # of entries. must be power of 2 */ - u16 cq_depth; + /* Sub completion queue depth in # of entries. must be power of 2 */ + u16 sub_cq_depth; /* EQ number assigned to this cq */ u16 eqn; @@ -519,8 +558,8 @@ struct efa_admin_create_cq_resp { u16 cq_idx; - /* actual cq depth in number of entries */ - u16 cq_actual_depth; + /* actual sub cq depth in number of entries */ + u16 sub_cq_actual_depth; /* CQ doorbell address, as offset to PCIe DB BAR */ u32 db_offset; @@ -578,6 +617,8 @@ struct efa_admin_basic_stats { u64 rx_pkts; u64 rx_drops; + + u64 qkey_viol; }; struct efa_admin_messages_stats { @@ -1057,7 +1098,6 @@ struct efa_admin_host_info { /* create_eq_cmd */ #define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) -#define EFA_ADMIN_CREATE_EQ_CMD_VIRT_MASK BIT(6) #define EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS_MASK BIT(0) /* host_info */ diff --git a/drivers/infiniband/hw/efa/efa_admin_defs.h b/drivers/infiniband/hw/efa/efa_admin_defs.h index 83f20c38a840..35700c93e639 100644 --- a/drivers/infiniband/hw/efa/efa_admin_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_ADMIN_H_ @@ -96,7 +96,7 @@ struct efa_admin_acq_entry { struct efa_admin_aenq_common_desc { u16 group; - u16 syndrom; + u16 syndrome; /* * 0 : phase diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 5a774925cdea..206f377db27e 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -163,7 +163,7 @@ int efa_com_create_cq(struct efa_com_dev *edev, EFA_SET(&create_cmd.cq_caps_2, EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS, params->entry_size_in_bytes / 4); - create_cmd.cq_depth = params->cq_depth; + create_cmd.sub_cq_depth = params->sub_cq_depth; create_cmd.num_sub_cqs = params->num_sub_cqs; create_cmd.uar = params->uarn; if (params->interrupt_mode_enabled) { @@ -191,7 +191,7 @@ int efa_com_create_cq(struct efa_com_dev *edev, } result->cq_idx = cmd_completion.cq_idx; - result->actual_depth = params->cq_depth; + result->actual_depth = params->sub_cq_depth; result->db_off = cmd_completion.db_offset; result->db_valid = EFA_GET(&cmd_completion.flags, EFA_ADMIN_CREATE_CQ_RESP_DB_VALID); diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 668d033f7477..2599f8e58cc4 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -71,7 +71,7 @@ struct efa_com_create_cq_params { /* cq physical base address in OS memory */ dma_addr_t dma_addr; /* completion queue depth in # of entries */ - u16 cq_depth; + u16 sub_cq_depth; u16 num_sub_cqs; u16 uarn; u16 eqn; diff --git a/drivers/infiniband/hw/efa/efa_io_defs.h b/drivers/infiniband/hw/efa/efa_io_defs.h index 2d8eb96eaa81..a4c9fd33da38 100644 --- a/drivers/infiniband/hw/efa/efa_io_defs.h +++ b/drivers/infiniband/hw/efa/efa_io_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_IO_H_ @@ -10,6 +10,7 @@ #define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 #define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 #define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 +#define EFA_IO_TX_DESC_INLINE_PBL_SIZE 1 enum efa_io_queue_type { /* send queue (of a QP) */ @@ -25,6 +26,10 @@ enum efa_io_send_op_type { EFA_IO_RDMA_READ = 1, /* RDMA write */ EFA_IO_RDMA_WRITE = 2, + /* Fast MR registration */ + EFA_IO_FAST_REG = 3, + /* Fast MR invalidation */ + EFA_IO_FAST_INV = 4, }; enum efa_io_comp_status { @@ -34,15 +39,15 @@ enum efa_io_comp_status { EFA_IO_COMP_STATUS_FLUSHED = 1, /* Internal QP error */ EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2, - /* Bad operation type */ - EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3, + /* Unsupported operation */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNSUPPORTED_OP = 3, /* Bad AH */ EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4, /* LKEY not registered or does not match IOVA */ EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5, /* Message too long */ EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6, - /* Destination ENI is down or does not run EFA */ + /* RKEY not registered or does not match remote IOVA */ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7, /* Connection was reset by remote side */ EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8, @@ -54,8 +59,17 @@ enum efa_io_comp_status { EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11, /* Unexpected status returned by responder */ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12, - /* Unresponsive remote - detected locally */ + /* Unresponsive remote - was previously responsive */ EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE = 13, + /* No valid AH at remote side (required for RDMA operations) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER = 14, + /* Unreachable remote - never received a response */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE = 15, +}; + +enum efa_io_frwr_pbl_mode { + EFA_IO_FRWR_INLINE_PBL = 0, + EFA_IO_FRWR_DIRECT_PBL = 1, }; struct efa_io_tx_meta_desc { @@ -95,13 +109,13 @@ struct efa_io_tx_meta_desc { /* * If inline_msg bit is set, length of inline message in bytes, - * otherwise length of SGL (number of buffers). + * otherwise length of SGL (number of buffers). */ u16 length; /* - * immediate data: if has_imm is set, then this field is included - * within Tx message and reported in remote Rx completion. + * immediate data: if has_imm is set, then this field is included within + * Tx message and reported in remote Rx completion. */ u32 immediate_data; @@ -158,6 +172,63 @@ struct efa_io_rdma_req { struct efa_io_tx_buf_desc local_mem[1]; }; +struct efa_io_fast_mr_reg_req { + /* Updated local key of the MR after lkey/rkey increment */ + u32 lkey; + + /* + * permissions + * 0 : local_write_enable - Local write permissions: + * must be set for RQ buffers and buffers posted for + * RDMA Read requests + * 1 : remote_write_enable - Remote write + * permissions: must be set to enable RDMA write to + * the region + * 2 : remote_read_enable - Remote read permissions: + * must be set to enable RDMA read from the region + * 7:3 : reserved2 - MBZ + */ + u8 permissions; + + /* + * control flags + * 4:0 : phys_page_size_shift - page size is (1 << + * phys_page_size_shift) + * 6:5 : pbl_mode - enum efa_io_frwr_pbl_mode + * 7 : reserved - MBZ + */ + u8 flags; + + /* MBZ */ + u8 reserved[2]; + + /* IO Virtual Address associated with this MR */ + u64 iova; + + /* Memory region length, in bytes */ + u64 mr_length; + + /* Physical Buffer List, each element is page-aligned. */ + union { + /* + * Inline array of physical page addresses (optimization + * for short region activation). + */ + u64 inline_array[1]; + + /* points to PBL (Currently only direct) */ + u64 dma_addr; + } pbl; +}; + +struct efa_io_fast_mr_inv_req { + /* Local key of the MR to invalidate */ + u32 lkey; + + /* MBZ */ + u8 reserved[28]; +}; + /* * Tx WQE, composed of tx meta descriptors followed by either tx buffer * descriptors or inline data @@ -174,6 +245,12 @@ struct efa_io_tx_wqe { /* RDMA local and remote memory addresses */ struct efa_io_rdma_req rdma_req; + + /* Fast registration */ + struct efa_io_fast_mr_reg_req reg_mr_req; + + /* Fast invalidation */ + struct efa_io_fast_mr_inv_req inv_mr_req; } data; }; @@ -208,7 +285,7 @@ struct efa_io_rx_desc { struct efa_io_cdesc_common { /* * verbs-generated request ID, as provided in the completed tx or rx - * descriptor. + * descriptor. */ u16 req_id; @@ -221,7 +298,8 @@ struct efa_io_cdesc_common { * 3 : has_imm - indicates that immediate data is * present - for RX completions only * 6:4 : op_type - enum efa_io_send_op_type - * 7 : reserved31 - MBZ + * 7 : unsolicited - indicates that there is no + * matching request - for RDMA with imm. RX only */ u8 flags; @@ -291,6 +369,13 @@ struct efa_io_rx_cdesc_ex { /* tx_buf_desc */ #define EFA_IO_TX_BUF_DESC_LKEY_MASK GENMASK(23, 0) +/* fast_mr_reg_req */ +#define EFA_IO_FAST_MR_REG_REQ_LOCAL_WRITE_ENABLE_MASK BIT(0) +#define EFA_IO_FAST_MR_REG_REQ_REMOTE_WRITE_ENABLE_MASK BIT(1) +#define EFA_IO_FAST_MR_REG_REQ_REMOTE_READ_ENABLE_MASK BIT(2) +#define EFA_IO_FAST_MR_REG_REQ_PHYS_PAGE_SIZE_SHIFT_MASK GENMASK(4, 0) +#define EFA_IO_FAST_MR_REG_REQ_PBL_MODE_MASK GENMASK(6, 5) + /* rx_desc */ #define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0) #define EFA_IO_RX_DESC_FIRST_MASK BIT(30) @@ -301,5 +386,6 @@ struct efa_io_rx_cdesc_ex { #define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1) #define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3) #define EFA_IO_CDESC_COMMON_OP_TYPE_MASK GENMASK(6, 4) +#define EFA_IO_CDESC_COMMON_UNSOLICITED_MASK BIT(7) #endif /* _EFA_IO_H_ */ diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index cc13415ff7e7..feb04cfdb8da 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1167,7 +1167,7 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, } params.uarn = cq->ucontext->uarn; - params.cq_depth = entries; + params.sub_cq_depth = entries; params.dma_addr = cq->dma_addr; params.entry_size_in_bytes = cmd.cq_entry_size; params.num_sub_cqs = cmd.num_sub_cqs; From 48931f65e9f785b65244550cc8f0c8bf9eab7acd Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Tue, 15 Oct 2024 17:42:42 +0000 Subject: [PATCH 05/51] RDMA/efa: Add option to set QP service level on create Using modify QP with AH attributes and IB_QP_AV flag set doesn't make much sense for connectionless QP types like SRD. Add SL parameter to EFA create QP user ABI and pass it to the device. Link: https://patch.msgid.link/r/20241015174242.3490-3-mrgolin@amazon.com Reviewed-by: Firas Jahjah Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_com_cmd.c | 1 + drivers/infiniband/hw/efa/efa_com_cmd.h | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 4 +++- include/uapi/rdma/efa-abi.h | 3 ++- 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 206f377db27e..9e04edb9dbda 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -31,6 +31,7 @@ int efa_com_create_qp(struct efa_com_dev *edev, create_qp_cmd.qp_alloc_size.recv_queue_depth = params->rq_depth; create_qp_cmd.uar = params->uarn; + create_qp_cmd.sl = params->sl; if (params->unsolicited_write_recv) EFA_SET(&create_qp_cmd.flags, EFA_ADMIN_CREATE_QP_CMD_UNSOLICITED_WRITE_RECV, 1); diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 2599f8e58cc4..25f02c0d9698 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -27,6 +27,7 @@ struct efa_com_create_qp_params { u16 pd; u16 uarn; u8 qp_type; + u8 sl; u8 unsolicited_write_recv : 1; }; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index feb04cfdb8da..ca3af866a5df 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -676,7 +676,7 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, goto err_out; } - if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_90)) { + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_98)) { ibdev_dbg(&dev->ibdev, "Incompatible ABI params, unknown fields in udata\n"); err = -EINVAL; @@ -732,6 +732,8 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, create_qp_params.rq_base_addr = qp->rq_dma_addr; } + create_qp_params.sl = cmd.sl; + if (cmd.flags & EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV) create_qp_params.unsolicited_write_recv = true; diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index d689b8b34189..11b94b0b035b 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -95,7 +95,8 @@ struct efa_ibv_create_qp { __u32 sq_ring_size; /* bytes */ __u32 driver_qp_type; __u16 flags; - __u8 reserved_90[6]; + __u8 sl; + __u8 reserved_98[5]; }; struct efa_ibv_create_qp_resp { From c11db1bf0ddc3cb60e6ee08e137eee10e19776b8 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 10 Oct 2024 13:16:19 +0300 Subject: [PATCH 06/51] RDMA/ipoib: Use the networking stack default for txqueuelen There is no need for a special txqueuelen value for IPoIB. This value represents the qdisc size which is not related to the SQ size, and the default value provided by the stack (DEFAULT_TX_QUEUE_LEN) is sufficient for typical use cases. Signed-off-by: Gal Pressman Link: https://patch.msgid.link/cc97764b5a8def4ea879b371549a5867fe75c756.1728555243.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 4e31bb0b6466..3b463db8ce39 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -2145,7 +2146,7 @@ void ipoib_setup_common(struct net_device *dev) dev->hard_header_len = IPOIB_HARD_LEN; dev->addr_len = INFINIBAND_ALEN; dev->type = ARPHRD_INFINIBAND; - dev->tx_queue_len = ipoib_sendq_size * 2; + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); netif_keep_dst(dev); From 82c32d219272fc87680540ade235c1f08ad37626 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 16 Oct 2024 00:55:42 -0700 Subject: [PATCH 07/51] RDMA/bnxt_re: Add support for optimized modify QP Modify QP improvements are for state transitions from INIT -> RTR and RTR -> RTS. In order to support the Modify QP Optimization feature, the driver is expected to check for the feature support in the CMDQ_QUERY_FUNC and register its support for this feature with the FW in CMDQ_INITIALIZE_FIRMWARE. Additionally, the driver is required to specify the new fields and attribute masks for the transitions as follows: 1. INIT -> RTR: - New fields: srq_used, type. - enable srq_used when RC QP is configured to use SRQ. - set the type based on the QP type. - Mandatory masks: - RC: CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS, CMDQ_MODIFY_QP_MODIFY_MASK_PKEY - UD QP and QP1: CMDQ_MODIFY_QP_MODIFY_MASK_PKEY, CMDQ_MODIFY_QP_MODIFY_MASK_QKEY 2. RTR -> RTS: - New fields: type - set the type based on the QP type. - Mandatory masks: - RC: CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS - UD QP and QP1: CMDQ_MODIFY_QP_MODIFY_MASK_QKEY Reviewed-by: Saravanan Vajravel Reviewed-by: Tushar Rane Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1729065346-1364-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 40 ++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 6 +++- drivers/infiniband/hw/bnxt_re/qplib_res.h | 5 +++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 2 ++ 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 42e98e5f94cb..ff2340c59fc1 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1277,6 +1277,40 @@ static void __filter_modify_flags(struct bnxt_qplib_qp *qp) } } +static void bnxt_set_mandatory_attributes(struct bnxt_qplib_qp *qp, + struct cmdq_modify_qp *req) +{ + u32 mandatory_flags = 0; + + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC) + mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS; + + if (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_INIT && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTR) { + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC && qp->srq) + req->flags = cpu_to_le16(CMDQ_MODIFY_QP_FLAGS_SRQ_USED); + mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; + } + + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_UD || + qp->type == CMDQ_MODIFY_QP_QP_TYPE_GSI) + mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; + + qp->modify_flags |= mandatory_flags; + req->qp_type = qp->type; +} + +static bool is_optimized_state_transition(struct bnxt_qplib_qp *qp) +{ + if ((qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_INIT && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTR) || + (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_RTR && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTS)) + return true; + + return false; +} + int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; @@ -1293,6 +1327,12 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) /* Filter out the qp_attr_mask based on the state->new transition */ __filter_modify_flags(qp); + if (qp->modify_flags & CMDQ_MODIFY_QP_MODIFY_MASK_STATE) { + /* Set mandatory attributes for INIT -> RTR and RTR -> RTS transition */ + if (_is_optimize_modify_qp_supported(res->dattr->dev_cap_flags2) && + is_optimized_state_transition(qp)) + bnxt_set_mandatory_attributes(qp, &req); + } bmask = qp->modify_flags; req.modify_mask = cpu_to_le32(qp->modify_flags); req.qp_cid = cpu_to_le32(qp->id); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 3ffaef0c2651..f5713e3c39fb 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -832,6 +832,7 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, struct creq_initialize_fw_resp resp = {}; struct cmdq_initialize_fw req = {}; struct bnxt_qplib_cmdqmsg msg = {}; + u16 flags = 0; u8 pgsz, lvl; int rc; @@ -906,7 +907,10 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, skip_ctx_setup: if (BNXT_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags)) - req.flags |= cpu_to_le16(CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED); + flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED; + if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2)) + flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED; + req.flags |= cpu_to_le16(flags); req.stat_ctx_id = cpu_to_le32(ctx->stats.fw_id); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index c2f710364e0f..ef198a6fc85a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -576,4 +576,9 @@ static inline bool _is_relaxed_ordering_supported(u16 dev_cap_ext_flags2) return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_MEMORY_REGION_RO_SUPPORTED; } +static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED; +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 3ec895284e49..492417eb5896 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -216,6 +216,7 @@ struct cmdq_initialize_fw { __le16 flags; #define CMDQ_INITIALIZE_FW_FLAGS_MRAV_RESERVATION_SPLIT 0x1UL #define CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED 0x2UL + #define CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED 0x8UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -559,6 +560,7 @@ struct cmdq_modify_qp { #define CMDQ_MODIFY_QP_OPCODE_LAST CMDQ_MODIFY_QP_OPCODE_MODIFY_QP u8 cmd_size; __le16 flags; + #define CMDQ_MODIFY_QP_FLAGS_SRQ_USED 0x1UL __le16 cookie; u8 resp_size; u8 qp_type; From 9c4927caccf37240743fefdf1b4380fbe845aa39 Mon Sep 17 00:00:00 2001 From: Chandramohan Akula Date: Wed, 16 Oct 2024 00:55:44 -0700 Subject: [PATCH 08/51] RDMA/bnxt_re: Add support for CQ rx coalescing RoCE message rate performance is heavily degraded without the use of cq coalescing. With proper coalescing, message rates get better. Furthermore, coalescing significantly reduces contention on the PCIe Root Complex/Memory subsystems. Add the changes to configure CQ rx colascing parameters based on adapter revision when CQ is created. Signed-off-by: Chandramohan Akula Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1729065346-1364-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 8 ++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 1 + drivers/infiniband/hw/bnxt_re/main.c | 9 +++++++++ drivers/infiniband/hw/bnxt_re/qplib_fp.c | 20 ++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_fp.h | 20 ++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_res.h | 5 +++++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 14 +++++++++++++- 7 files changed, 76 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index e94518b12f86..bb28a1fe1430 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -156,6 +156,13 @@ struct bnxt_re_pacing { #define MAX_CQ_HASH_BITS (16) #define MAX_SRQ_HASH_BITS (16) + +static inline bool bnxt_re_chip_gen_p7(u16 chip_num) +{ + return (chip_num == CHIP_NUM_58818 || + chip_num == CHIP_NUM_57608); +} + struct bnxt_re_dev { struct ib_device ibdev; struct list_head list; @@ -195,6 +202,7 @@ struct bnxt_re_dev { struct bnxt_qplib_ctx qplib_ctx; struct bnxt_qplib_res qplib_res; struct bnxt_qplib_dpi dpi_privileged; + struct bnxt_qplib_cq_coal_param cq_coalescing; struct mutex qp_lock; /* protect qp list */ struct list_head qp_list; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 460f33914825..55a3cc8aaf96 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3065,6 +3065,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->qplib_cq.max_wqe = entries; cq->qplib_cq.cnq_hw_ring_id = nq->ring_id; cq->qplib_cq.nq = nq; + cq->qplib_cq.coalescing = &rdev->cq_coalescing; rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq); if (rc) { diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 777068de4bbc..3a0181843dba 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -986,6 +986,15 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, atomic_set(&rdev->stats.res.pd_count, 0); rdev->cosq[0] = 0xFFFF; rdev->cosq[1] = 0xFFFF; + rdev->cq_coalescing.buf_maxtime = BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME; + if (bnxt_re_chip_gen_p7(en_dev->chip_num)) { + rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7; + rdev->cq_coalescing.during_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P7; + } else { + rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P5; + rdev->cq_coalescing.during_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P5; + } + rdev->cq_coalescing.en_ring_idle_mode = BNXT_QPLIB_CQ_COAL_DEF_EN_RING_IDLE_MODE; return rdev; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index ff2340c59fc1..e2eea714e977 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2182,6 +2182,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_create_cq req = {}; struct bnxt_qplib_pbl *pbl; + u32 coalescing = 0; u32 pg_sz_lvl; int rc; @@ -2208,6 +2209,25 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) req.dpi = cpu_to_le32(cq->dpi->dpi); req.cq_handle = cpu_to_le64(cq->cq_handle); req.cq_size = cpu_to_le32(cq->max_wqe); + + if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2)) { + req.flags |= cpu_to_le16(CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID); + coalescing |= ((cq->coalescing->buf_maxtime << + CMDQ_CREATE_CQ_BUF_MAXTIME_SFT) & + CMDQ_CREATE_CQ_BUF_MAXTIME_MASK); + coalescing |= ((cq->coalescing->normal_maxbuf << + CMDQ_CREATE_CQ_NORMAL_MAXBUF_SFT) & + CMDQ_CREATE_CQ_NORMAL_MAXBUF_MASK); + coalescing |= ((cq->coalescing->during_maxbuf << + CMDQ_CREATE_CQ_DURING_MAXBUF_SFT) & + CMDQ_CREATE_CQ_DURING_MAXBUF_MASK); + if (cq->coalescing->en_ring_idle_mode) + coalescing |= CMDQ_CREATE_CQ_ENABLE_RING_IDLE_MODE; + else + coalescing &= ~CMDQ_CREATE_CQ_ENABLE_RING_IDLE_MODE; + req.coalescing = cpu_to_le32(coalescing); + } + pbl = &cq->hwq.pbl[PBL_LVL_0]; pg_sz_lvl = (bnxt_qplib_base_pg_size(&cq->hwq) << CMDQ_CREATE_CQ_PG_SIZE_SFT); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index b62df8701950..fb01576e545d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -383,6 +383,25 @@ static inline bool bnxt_qplib_queue_full(struct bnxt_qplib_q *que, return avail <= slots; } +/* CQ coalescing parameters */ +struct bnxt_qplib_cq_coal_param { + u16 buf_maxtime; + u8 normal_maxbuf; + u8 during_maxbuf; + u8 en_ring_idle_mode; +}; + +#define BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME 0x1 +#define BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7 0x8 +#define BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P7 0x8 +#define BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P5 0x1 +#define BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P5 0x1 +#define BNXT_QPLIB_CQ_COAL_DEF_EN_RING_IDLE_MODE 0x1 +#define BNXT_QPLIB_CQ_COAL_MAX_BUF_MAXTIME 0x1bf +#define BNXT_QPLIB_CQ_COAL_MAX_NORMAL_MAXBUF 0x1f +#define BNXT_QPLIB_CQ_COAL_MAX_DURING_MAXBUF 0x1f +#define BNXT_QPLIB_CQ_COAL_MAX_EN_RING_IDLE_MODE 0x1 + struct bnxt_qplib_cqe { u8 status; u8 type; @@ -445,6 +464,7 @@ struct bnxt_qplib_cq { */ spinlock_t flush_lock; /* QP flush management */ u16 cnq_events; + struct bnxt_qplib_cq_coal_param *coalescing; }; #define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE sizeof(struct xrrq_irrq) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index ef198a6fc85a..115910c7e56d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -581,4 +581,9 @@ static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2) return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED; } +static inline bool _is_cq_coalescing_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED; +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 492417eb5896..a7679eedbf27 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -1139,6 +1139,7 @@ struct cmdq_create_cq { #define CMDQ_CREATE_CQ_FLAGS_DISABLE_CQ_OVERFLOW_DETECTION 0x1UL #define CMDQ_CREATE_CQ_FLAGS_STEERING_TAG_VALID 0x2UL #define CMDQ_CREATE_CQ_FLAGS_INFINITE_CQ_MODE 0x4UL + #define CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID 0x8UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1171,7 +1172,18 @@ struct cmdq_create_cq { __le32 cq_size; __le64 pbl; __le16 steering_tag; - u8 reserved48[6]; + u8 reserved48[2]; + __le32 coalescing; + #define CMDQ_CREATE_CQ_BUF_MAXTIME_MASK 0x1ffUL + #define CMDQ_CREATE_CQ_BUF_MAXTIME_SFT 0 + #define CMDQ_CREATE_CQ_NORMAL_MAXBUF_MASK 0x3e00UL + #define CMDQ_CREATE_CQ_NORMAL_MAXBUF_SFT 9 + #define CMDQ_CREATE_CQ_DURING_MAXBUF_MASK 0x7c000UL + #define CMDQ_CREATE_CQ_DURING_MAXBUF_SFT 14 + #define CMDQ_CREATE_CQ_ENABLE_RING_IDLE_MODE 0x80000UL + #define CMDQ_CREATE_CQ_UNUSED12_MASK 0xfff00000UL + #define CMDQ_CREATE_CQ_UNUSED12_SFT 20 + __le64 reserved64; }; /* creq_create_cq_resp (size:128b/16B) */ From 9a420bb2b4ff1563226ae7c9335d1b34f50c033f Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 16 Oct 2024 00:55:45 -0700 Subject: [PATCH 09/51] RDMA/bnxt_re: Add support for modify_device hook Adds support for modify_device in the driver for node desc changes. Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1729065346-1364-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 16 ++++++++++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 +++ drivers/infiniband/hw/bnxt_re/main.c | 1 + 3 files changed, 20 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 55a3cc8aaf96..2a21a908aa51 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -211,6 +211,22 @@ int bnxt_re_query_device(struct ib_device *ibdev, return 0; } +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + ibdev_dbg(ibdev, "Modify device with mask 0x%x", device_modify_mask); + + if (device_modify_mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + memcpy(ibdev->node_desc, device_modify->node_desc, IB_DEVICE_NODE_DESC_MAX); + return 0; +} + /* Port */ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *port_attr) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index b789e47ec97a..83a584e2a16e 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -196,6 +196,9 @@ static inline bool bnxt_re_is_var_size_supported(struct bnxt_re_dev *rdev, int bnxt_re_query_device(struct ib_device *ibdev, struct ib_device_attr *ib_attr, struct ib_udata *udata); +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify); int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *port_attr); int bnxt_re_get_port_immutable(struct ib_device *ibdev, u32 port_num, diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 3a0181843dba..d825eda6189e 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -911,6 +911,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .post_srq_recv = bnxt_re_post_srq_recv, .query_ah = bnxt_re_query_ah, .query_device = bnxt_re_query_device, + .modify_device = bnxt_re_modify_device, .query_pkey = bnxt_re_query_pkey, .query_port = bnxt_re_query_port, .query_qp = bnxt_re_query_qp, From 52f70dea4201f12683236a0d02c03ca4f6145382 Mon Sep 17 00:00:00 2001 From: Hongguang Gao Date: Wed, 16 Oct 2024 00:55:46 -0700 Subject: [PATCH 10/51] RDMA/bnxt_re: Fix access flags for MR and QP modify Access flag definition in MR and QP is different in FW. Currently both reg/bind MR and modify/query QP uses the same flags. Add a different function to map the QP access flags for newer adapters. Signed-off-by: Kalesh AP Signed-off-by: Hongguang Gao Reviewed-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1729065346-1364-6-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 59 ++++++++++++++++++++---- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 2a21a908aa51..e6108079dad5 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -94,9 +94,9 @@ static int __from_ib_access_flags(int iflags) return qflags; }; -static enum ib_access_flags __to_ib_access_flags(int qflags) +static int __to_ib_access_flags(int qflags) { - enum ib_access_flags iflags = 0; + int iflags = 0; if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE) iflags |= IB_ACCESS_LOCAL_WRITE; @@ -113,7 +113,49 @@ static enum ib_access_flags __to_ib_access_flags(int qflags) if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND) iflags |= IB_ACCESS_ON_DEMAND; return iflags; -}; +} + +static u8 __qp_access_flags_from_ib(struct bnxt_qplib_chip_ctx *cctx, int iflags) +{ + u8 qflags = 0; + + if (!bnxt_qplib_is_chip_gen_p5_p7(cctx)) + /* For Wh+ */ + return (u8)__from_ib_access_flags(iflags); + + /* For P5, P7 and later chips */ + if (iflags & IB_ACCESS_LOCAL_WRITE) + qflags |= CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE; + if (iflags & IB_ACCESS_REMOTE_WRITE) + qflags |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE; + if (iflags & IB_ACCESS_REMOTE_READ) + qflags |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ; + if (iflags & IB_ACCESS_REMOTE_ATOMIC) + qflags |= CMDQ_MODIFY_QP_ACCESS_REMOTE_ATOMIC; + + return qflags; +} + +static int __qp_access_flags_to_ib(struct bnxt_qplib_chip_ctx *cctx, u8 qflags) +{ + int iflags = 0; + + if (!bnxt_qplib_is_chip_gen_p5_p7(cctx)) + /* For Wh+ */ + return __to_ib_access_flags(qflags); + + /* For P5, P7 and later chips */ + if (qflags & CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE) + iflags |= IB_ACCESS_LOCAL_WRITE; + if (qflags & CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE) + iflags |= IB_ACCESS_REMOTE_WRITE; + if (qflags & CMDQ_MODIFY_QP_ACCESS_REMOTE_READ) + iflags |= IB_ACCESS_REMOTE_READ; + if (qflags & CMDQ_MODIFY_QP_ACCESS_REMOTE_ATOMIC) + iflags |= IB_ACCESS_REMOTE_ATOMIC; + + return iflags; +} static void bnxt_re_check_and_set_relaxed_ordering(struct bnxt_re_dev *rdev, struct bnxt_qplib_mrw *qplib_mr) @@ -2053,12 +2095,10 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS; qp->qplib_qp.access = - __from_ib_access_flags(qp_attr->qp_access_flags); + __qp_access_flags_from_ib(qp->qplib_qp.cctx, + qp_attr->qp_access_flags); /* LOCAL_WRITE access must be set to allow RC receive */ - qp->qplib_qp.access |= BNXT_QPLIB_ACCESS_LOCAL_WRITE; - /* Temp: Set all params on QP as of now */ - qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE; - qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ; + qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE; } if (qp_attr_mask & IB_QP_PKEY_INDEX) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; @@ -2263,7 +2303,8 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->qp_state = __to_ib_qp_state(qplib_qp->state); qp_attr->cur_qp_state = __to_ib_qp_state(qplib_qp->cur_qp_state); qp_attr->en_sqd_async_notify = qplib_qp->en_sqd_async_notify ? 1 : 0; - qp_attr->qp_access_flags = __to_ib_access_flags(qplib_qp->access); + qp_attr->qp_access_flags = __qp_access_flags_to_ib(qp->qplib_qp.cctx, + qplib_qp->access); qp_attr->pkey_index = qplib_qp->pkey_index; qp_attr->qkey = qplib_qp->qkey; qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; From 427b1f3bba2aa9322561dc3448aacc15a1f68284 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Sun, 20 Oct 2024 18:15:43 -0700 Subject: [PATCH 11/51] RDMA: Use ethtool string helpers Avoids having to manually increment the pointer. Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20241021011543.5922-1-rosenp@gmail.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 9 +++------ drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c | 4 +--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 7da94fb8d7fa..4feb7170535c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -128,16 +128,13 @@ static void ipoib_get_ethtool_stats(struct net_device *dev, static void ipoib_get_strings(struct net_device __always_unused *dev, u32 stringset, u8 *data) { - u8 *p = data; int i; switch (stringset) { case ETH_SS_STATS: - for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) { - memcpy(p, ipoib_gstrings_stats[i].stat_string, - ETH_GSTRING_LEN); - p += ETH_GSTRING_LEN; - } + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) + ethtool_puts(&data, + ipoib_gstrings_stats[i].stat_string); break; default: break; diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c index 29b3d8fce3f5..316959940d2f 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c @@ -164,9 +164,7 @@ static void vnic_get_strings(struct net_device *netdev, u32 stringset, u8 *data) return; for (i = 0; i < VNIC_STATS_LEN; i++) - memcpy(data + i * ETH_GSTRING_LEN, - vnic_gstrings_stats[i].stat_string, - ETH_GSTRING_LEN); + ethtool_puts(&data, vnic_gstrings_stats[i].stat_string); } /* ethtool ops */ From 571e4ab8a45e530623ab129803f090a844dd3fe9 Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Thu, 24 Oct 2024 20:39:56 +0800 Subject: [PATCH 12/51] RDMA/hns: Fix an AEQE overflow error caused by untimely update of eq_db_ci eq_db_ci is updated only after all AEQEs are processed in the AEQ interrupt handler, which is not timely enough and may result in AEQ overflow. Two optimization methods are proposed: 1. Set an upper limit for AEQE processing. 2. Move time-consuming operations such as printings to the bottom half of the interrupt. cmd events and flush_cqe events are still fully processed in the top half to ensure timely handling. Fixes: a5073d6054f7 ("RDMA/hns: Add eq support of hip08") Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241024124000.2931869-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 75 ++++++++++++++------- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 ++ drivers/infiniband/hw/hns/hns_roce_qp.c | 56 +++++++++------ 4 files changed, 92 insertions(+), 45 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 0b1e21cb6d2d..73c78005901e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1289,6 +1289,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn); void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type); void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp); void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type); +void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn); void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type); void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev); int hns_roce_init(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f1feaa79f78e..bbdeb02102e8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5967,11 +5967,10 @@ static int hns_roce_v2_query_mpt(struct hns_roce_dev *hr_dev, u32 key, return ret; } -static void hns_roce_irq_work_handle(struct work_struct *work) +static void dump_aeqe_log(struct hns_roce_work *irq_work) { - struct hns_roce_work *irq_work = - container_of(work, struct hns_roce_work, work); - struct ib_device *ibdev = &irq_work->hr_dev->ib_dev; + struct hns_roce_dev *hr_dev = irq_work->hr_dev; + struct ib_device *ibdev = &hr_dev->ib_dev; switch (irq_work->event_type) { case HNS_ROCE_EVENT_TYPE_PATH_MIG: @@ -6015,6 +6014,8 @@ static void hns_roce_irq_work_handle(struct work_struct *work) case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: ibdev_warn(ibdev, "DB overflow.\n"); break; + case HNS_ROCE_EVENT_TYPE_MB: + break; case HNS_ROCE_EVENT_TYPE_FLR: ibdev_warn(ibdev, "function level reset.\n"); break; @@ -6025,8 +6026,46 @@ static void hns_roce_irq_work_handle(struct work_struct *work) ibdev_err(ibdev, "invalid xrceth error.\n"); break; default: + ibdev_info(ibdev, "Undefined event %d.\n", + irq_work->event_type); break; } +} + +static void hns_roce_irq_work_handle(struct work_struct *work) +{ + struct hns_roce_work *irq_work = + container_of(work, struct hns_roce_work, work); + struct hns_roce_dev *hr_dev = irq_work->hr_dev; + int event_type = irq_work->event_type; + u32 queue_num = irq_work->queue_num; + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + case HNS_ROCE_EVENT_TYPE_COMM_EST: + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION: + case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH: + hns_roce_qp_event(hr_dev, queue_num, event_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + hns_roce_srq_event(hr_dev, queue_num, event_type); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + hns_roce_cq_event(hr_dev, queue_num, event_type); + break; + default: + break; + } + + dump_aeqe_log(irq_work); kfree(irq_work); } @@ -6087,14 +6126,14 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) { - struct device *dev = hr_dev->dev; struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq); irqreturn_t aeqe_found = IRQ_NONE; + int num_aeqes = 0; int event_type; u32 queue_num; int sub_type; - while (aeqe) { + while (aeqe && num_aeqes < HNS_AEQ_POLLING_BUDGET) { /* Make sure we read AEQ entry after we have checked the * ownership bit */ @@ -6105,25 +6144,12 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, queue_num = hr_reg_read(aeqe, AEQE_EVENT_QUEUE_NUM); switch (event_type) { - case HNS_ROCE_EVENT_TYPE_PATH_MIG: - case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: - case HNS_ROCE_EVENT_TYPE_COMM_EST: - case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION: case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH: - hns_roce_qp_event(hr_dev, queue_num, event_type); - break; - case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: - case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: - hns_roce_srq_event(hr_dev, queue_num, event_type); - break; - case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - hns_roce_cq_event(hr_dev, queue_num, event_type); + hns_roce_flush_cqe(hr_dev, queue_num); break; case HNS_ROCE_EVENT_TYPE_MB: hns_roce_cmd_event(hr_dev, @@ -6131,12 +6157,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, aeqe->event.cmd.status, le64_to_cpu(aeqe->event.cmd.out_param)); break; - case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: - case HNS_ROCE_EVENT_TYPE_FLR: - break; default: - dev_err(dev, "unhandled event %d on EQ %d at idx %u.\n", - event_type, eq->eqn, eq->cons_index); break; } @@ -6150,6 +6171,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, hns_roce_v2_init_irq_work(hr_dev, eq, queue_num); aeqe = next_aeqe_sw_v2(eq); + ++num_aeqes; } update_eq_db(eq); @@ -6699,6 +6721,9 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) int ret; int i; + if (hr_dev->caps.aeqe_depth < HNS_AEQ_POLLING_BUDGET) + return -EINVAL; + other_num = hr_dev->caps.num_other_vectors; comp_num = hr_dev->caps.num_comp_vectors; aeq_num = hr_dev->caps.num_aeq_vectors; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index c65f68a14a26..3b3c6259ace0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -85,6 +85,11 @@ #define HNS_ROCE_V2_TABLE_CHUNK_SIZE (1 << 18) +/* budget must be smaller than aeqe_depth to guarantee that we update + * the ci before we polled all the entries in the EQ. + */ +#define HNS_AEQ_POLLING_BUDGET 64 + enum { HNS_ROCE_CMD_FLAG_IN = BIT(0), HNS_ROCE_CMD_FLAG_OUT = BIT(1), diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 6b03ba671ff8..dcaa370d4a26 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -39,6 +39,25 @@ #include "hns_roce_device.h" #include "hns_roce_hem.h" +static struct hns_roce_qp *hns_roce_qp_lookup(struct hns_roce_dev *hr_dev, + u32 qpn) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_qp *qp; + unsigned long flags; + + xa_lock_irqsave(&hr_dev->qp_table_xa, flags); + qp = __hns_roce_qp_lookup(hr_dev, qpn); + if (qp) + refcount_inc(&qp->refcount); + xa_unlock_irqrestore(&hr_dev->qp_table_xa, flags); + + if (!qp) + dev_warn(dev, "async event for bogus QP %08x\n", qpn); + + return qp; +} + static void flush_work_handle(struct work_struct *work) { struct hns_roce_work *flush_work = container_of(work, @@ -95,29 +114,11 @@ void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp) void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) { - struct device *dev = hr_dev->dev; struct hns_roce_qp *qp; - xa_lock(&hr_dev->qp_table_xa); - qp = __hns_roce_qp_lookup(hr_dev, qpn); - if (qp) - refcount_inc(&qp->refcount); - xa_unlock(&hr_dev->qp_table_xa); - - if (!qp) { - dev_warn(dev, "async event for bogus QP %08x\n", qpn); + qp = hns_roce_qp_lookup(hr_dev, qpn); + if (!qp) return; - } - - if (event_type == HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR || - event_type == HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR || - event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR || - event_type == HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION || - event_type == HNS_ROCE_EVENT_TYPE_INVALID_XRCETH) { - qp->state = IB_QPS_ERR; - - flush_cqe(hr_dev, qp); - } qp->event(qp, (enum hns_roce_event)event_type); @@ -125,6 +126,21 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) complete(&qp->free); } +void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn) +{ + struct hns_roce_qp *qp; + + qp = hns_roce_qp_lookup(hr_dev, qpn); + if (!qp) + return; + + qp->state = IB_QPS_ERR; + flush_cqe(hr_dev, qp); + + if (refcount_dec_and_test(&qp->refcount)) + complete(&qp->free); +} + static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp, enum hns_roce_event type) { From 377a2097705b915325a67e4d44f9f2844e567809 Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Thu, 24 Oct 2024 20:39:57 +0800 Subject: [PATCH 13/51] RDMA/hns: Fix flush cqe error when racing with destroy qp QP needs to be modified to IB_QPS_ERROR to trigger HW flush cqe. But when this process races with destroy qp, the destroy-qp process may modify the QP to IB_QPS_RESET first. In this case flush cqe will fail since it is invalid to modify qp from IB_QPS_RESET to IB_QPS_ERROR. Add lock and bit flag to make sure pending flush cqe work is completed first and no more new works will be added. Fixes: ffd541d45726 ("RDMA/hns: Add the workqueue framework for flush cqe handler") Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241024124000.2931869-3-huangjunxian6@hisilicon.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 7 +++++++ drivers/infiniband/hw/hns/hns_roce_qp.c | 15 +++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 73c78005901e..9b51d5a1533f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -593,6 +593,7 @@ struct hns_roce_dev; enum { HNS_ROCE_FLUSH_FLAG = 0, + HNS_ROCE_STOP_FLUSH_FLAG = 1, }; struct hns_roce_work { @@ -656,6 +657,7 @@ struct hns_roce_qp { enum hns_roce_cong_type cong_type; u8 tc_mode; u8 priority; + spinlock_t flush_lock; }; struct hns_roce_ib_iboe { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index bbdeb02102e8..4c3bc1f6a183 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5598,8 +5598,15 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + unsigned long flags; int ret; + /* Make sure flush_cqe() is completed */ + spin_lock_irqsave(&hr_qp->flush_lock, flags); + set_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag); + spin_unlock_irqrestore(&hr_qp->flush_lock, flags); + flush_work(&hr_qp->flush_work.work); + ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata); if (ret) ibdev_err(&hr_dev->ib_dev, diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index dcaa370d4a26..2ad03ecdbf8e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -90,11 +90,18 @@ static void flush_work_handle(struct work_struct *work) void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { struct hns_roce_work *flush_work = &hr_qp->flush_work; + unsigned long flags; + + spin_lock_irqsave(&hr_qp->flush_lock, flags); + /* Exit directly after destroy_qp() */ + if (test_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag)) { + spin_unlock_irqrestore(&hr_qp->flush_lock, flags); + return; + } - flush_work->hr_dev = hr_dev; - INIT_WORK(&flush_work->work, flush_work_handle); refcount_inc(&hr_qp->refcount); queue_work(hr_dev->irq_workq, &flush_work->work); + spin_unlock_irqrestore(&hr_qp->flush_lock, flags); } void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp) @@ -1140,6 +1147,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, struct ib_udata *udata, struct hns_roce_qp *hr_qp) { + struct hns_roce_work *flush_work = &hr_qp->flush_work; struct hns_roce_ib_create_qp_resp resp = {}; struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_ib_create_qp ucmd = {}; @@ -1148,9 +1156,12 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, mutex_init(&hr_qp->mutex); spin_lock_init(&hr_qp->sq.lock); spin_lock_init(&hr_qp->rq.lock); + spin_lock_init(&hr_qp->flush_lock); hr_qp->state = IB_QPS_RESET; hr_qp->flush_flag = 0; + flush_work->hr_dev = hr_dev; + INIT_WORK(&flush_work->work, flush_work_handle); if (init_attr->create_flags) return -EOPNOTSUPP; From 370a9351bf84afc5a56a3f02ba3805bbfcb53c32 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Thu, 24 Oct 2024 20:39:58 +0800 Subject: [PATCH 14/51] RDMA/hns: Modify debugfs name The sub-directory of hns_roce debugfs is named after the device's kernel name currently, but it will be inconvenient to use when the device is renamed. Modify the name to pci name as users can always easily find the correspondence between an RDMA device and its pci name. Fixes: eb7854d63db5 ("RDMA/hns: Support SW stats with debugfs") Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241024124000.2931869-4-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_debugfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c index e8febb40f645..b869cdc54118 100644 --- a/drivers/infiniband/hw/hns/hns_roce_debugfs.c +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c @@ -5,6 +5,7 @@ #include #include +#include #include "hns_roce_device.h" @@ -86,7 +87,7 @@ void hns_roce_register_debugfs(struct hns_roce_dev *hr_dev) { struct hns_roce_dev_debugfs *dbgfs = &hr_dev->dbgfs; - dbgfs->root = debugfs_create_dir(dev_name(&hr_dev->ib_dev.dev), + dbgfs->root = debugfs_create_dir(pci_name(hr_dev->pci_dev), hns_roce_dbgfs_root); create_sw_stat_debugfs(hr_dev, dbgfs->root); From d81fb6511abf18591befaa5f4a972ffc838690ec Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Thu, 24 Oct 2024 20:39:59 +0800 Subject: [PATCH 15/51] RDMA/hns: Use dev_* printings in hem code instead of ibdev_* The hem code is executed before ib_dev is registered, so use dev_* printing instead of ibdev_* to avoid log like this: (null): set HEM address to HW failed! Fixes: 2f49de21f3e9 ("RDMA/hns: Optimize mhop get flow for multi-hop addressing") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241024124000.2931869-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hem.c | 44 ++++++++++++------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index c7c167e2a045..ee5d2c1bb5ca 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -300,7 +300,7 @@ static int calc_hem_config(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop, struct hns_roce_hem_index *index) { - struct ib_device *ibdev = &hr_dev->ib_dev; + struct device *dev = hr_dev->dev; unsigned long mhop_obj = obj; u32 l0_idx, l1_idx, l2_idx; u32 chunk_ba_num; @@ -331,14 +331,14 @@ static int calc_hem_config(struct hns_roce_dev *hr_dev, index->buf = l0_idx; break; default: - ibdev_err(ibdev, "table %u not support mhop.hop_num = %u!\n", - table->type, mhop->hop_num); + dev_err(dev, "table %u not support mhop.hop_num = %u!\n", + table->type, mhop->hop_num); return -EINVAL; } if (unlikely(index->buf >= table->num_hem)) { - ibdev_err(ibdev, "table %u exceed hem limt idx %llu, max %lu!\n", - table->type, index->buf, table->num_hem); + dev_err(dev, "table %u exceed hem limt idx %llu, max %lu!\n", + table->type, index->buf, table->num_hem); return -EINVAL; } @@ -448,14 +448,14 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop, struct hns_roce_hem_index *index) { - struct ib_device *ibdev = &hr_dev->ib_dev; + struct device *dev = hr_dev->dev; u32 step_idx; int ret = 0; if (index->inited & HEM_INDEX_L0) { ret = hr_dev->hw->set_hem(hr_dev, table, obj, 0); if (ret) { - ibdev_err(ibdev, "set HEM step 0 failed!\n"); + dev_err(dev, "set HEM step 0 failed!\n"); goto out; } } @@ -463,7 +463,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, if (index->inited & HEM_INDEX_L1) { ret = hr_dev->hw->set_hem(hr_dev, table, obj, 1); if (ret) { - ibdev_err(ibdev, "set HEM step 1 failed!\n"); + dev_err(dev, "set HEM step 1 failed!\n"); goto out; } } @@ -475,7 +475,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, step_idx = mhop->hop_num; ret = hr_dev->hw->set_hem(hr_dev, table, obj, step_idx); if (ret) - ibdev_err(ibdev, "set HEM step last failed!\n"); + dev_err(dev, "set HEM step last failed!\n"); } out: return ret; @@ -485,14 +485,14 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, unsigned long obj) { - struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_hem_index index = {}; struct hns_roce_hem_mhop mhop = {}; + struct device *dev = hr_dev->dev; int ret; ret = calc_hem_config(hr_dev, table, obj, &mhop, &index); if (ret) { - ibdev_err(ibdev, "calc hem config failed!\n"); + dev_err(dev, "calc hem config failed!\n"); return ret; } @@ -504,7 +504,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, ret = alloc_mhop_hem(hr_dev, table, &mhop, &index); if (ret) { - ibdev_err(ibdev, "alloc mhop hem failed!\n"); + dev_err(dev, "alloc mhop hem failed!\n"); goto out; } @@ -512,7 +512,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, if (table->type < HEM_TYPE_MTT) { ret = set_mhop_hem(hr_dev, table, obj, &mhop, &index); if (ret) { - ibdev_err(ibdev, "set HEM address to HW failed!\n"); + dev_err(dev, "set HEM address to HW failed!\n"); goto err_alloc; } } @@ -575,7 +575,7 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop, struct hns_roce_hem_index *index) { - struct ib_device *ibdev = &hr_dev->ib_dev; + struct device *dev = hr_dev->dev; u32 hop_num = mhop->hop_num; u32 chunk_ba_num; u32 step_idx; @@ -605,21 +605,21 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, ret = hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx); if (ret) - ibdev_warn(ibdev, "failed to clear hop%u HEM, ret = %d.\n", - hop_num, ret); + dev_warn(dev, "failed to clear hop%u HEM, ret = %d.\n", + hop_num, ret); if (index->inited & HEM_INDEX_L1) { ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 1); if (ret) - ibdev_warn(ibdev, "failed to clear HEM step 1, ret = %d.\n", - ret); + dev_warn(dev, "failed to clear HEM step 1, ret = %d.\n", + ret); } if (index->inited & HEM_INDEX_L0) { ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0); if (ret) - ibdev_warn(ibdev, "failed to clear HEM step 0, ret = %d.\n", - ret); + dev_warn(dev, "failed to clear HEM step 0, ret = %d.\n", + ret); } } } @@ -629,14 +629,14 @@ static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev, unsigned long obj, int check_refcount) { - struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_hem_index index = {}; struct hns_roce_hem_mhop mhop = {}; + struct device *dev = hr_dev->dev; int ret; ret = calc_hem_config(hr_dev, table, obj, &mhop, &index); if (ret) { - ibdev_err(ibdev, "calc hem config failed!\n"); + dev_err(dev, "calc hem config failed!\n"); return; } From 323275ac2ff15b2b7b3eac391ae5d8c5a3c3a999 Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Thu, 24 Oct 2024 20:40:00 +0800 Subject: [PATCH 16/51] RDMA/hns: Fix cpu stuck caused by printings during reset During reset, cmd to destroy resources such as qp, cq, and mr may fail, and error logs will be printed. When a large number of resources are destroyed, there will be lots of printings, and it may lead to a cpu stuck. Delete some unnecessary printings and replace other printing functions in these paths with the ratelimited version. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Fixes: c7bcb13442e1 ("RDMA/hns: Add SRQ support for hip08 kernel mode") Fixes: 70f92521584f ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT") Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC") Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241024124000.2931869-6-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_cq.c | 4 +- drivers/infiniband/hw/hns/hns_roce_hem.c | 4 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 73 ++++++++++------------ drivers/infiniband/hw/hns/hns_roce_mr.c | 4 +- drivers/infiniband/hw/hns/hns_roce_srq.c | 4 +- 5 files changed, 41 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 4ec66611a143..4106423a1b39 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -179,8 +179,8 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_CQC, hr_cq->cqn); if (ret) - dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret, - hr_cq->cqn); + dev_err_ratelimited(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", + ret, hr_cq->cqn); xa_erase_irq(&cq_table->array, hr_cq->cqn); diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index ee5d2c1bb5ca..f84521be3bea 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -672,8 +672,8 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT); if (ret) - dev_warn(dev, "failed to clear HEM base address, ret = %d.\n", - ret); + dev_warn_ratelimited(dev, "failed to clear HEM base address, ret = %d.\n", + ret); hns_roce_free_hem(hr_dev, table->hem[i]); table->hem[i] = NULL; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 4c3bc1f6a183..d1c075fb0ad8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -373,19 +373,12 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, static int check_send_valid(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { - struct ib_device *ibdev = &hr_dev->ib_dev; - if (unlikely(hr_qp->state == IB_QPS_RESET || hr_qp->state == IB_QPS_INIT || - hr_qp->state == IB_QPS_RTR)) { - ibdev_err(ibdev, "failed to post WQE, QP state %u!\n", - hr_qp->state); + hr_qp->state == IB_QPS_RTR)) return -EINVAL; - } else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) { - ibdev_err(ibdev, "failed to post WQE, dev state %d!\n", - hr_dev->state); + else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) return -EIO; - } return 0; } @@ -2775,8 +2768,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, IB_QPS_INIT, NULL); if (ret) { - ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", - ret); + ibdev_err_ratelimited(ibdev, "failed to modify qp to init, ret = %d.\n", + ret); return ret; } @@ -3421,8 +3414,8 @@ static int free_mr_post_send_lp_wqe(struct hns_roce_qp *hr_qp) ret = hns_roce_v2_post_send(&hr_qp->ibqp, send_wr, &bad_wr); if (ret) { - ibdev_err(ibdev, "failed to post wqe for free mr, ret = %d.\n", - ret); + ibdev_err_ratelimited(ibdev, "failed to post wqe for free mr, ret = %d.\n", + ret); return ret; } @@ -3461,9 +3454,9 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) ret = free_mr_post_send_lp_wqe(hr_qp); if (ret) { - ibdev_err(ibdev, - "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n", - hr_qp->qpn, ret); + ibdev_err_ratelimited(ibdev, + "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n", + hr_qp->qpn, ret); break; } @@ -3474,16 +3467,16 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) while (cqe_cnt) { npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc); if (npolled < 0) { - ibdev_err(ibdev, - "failed to poll cqe for free mr, remain %d cqe.\n", - cqe_cnt); + ibdev_err_ratelimited(ibdev, + "failed to poll cqe for free mr, remain %d cqe.\n", + cqe_cnt); goto out; } if (time_after(jiffies, end)) { - ibdev_err(ibdev, - "failed to poll cqe for free mr and timeout, remain %d cqe.\n", - cqe_cnt); + ibdev_err_ratelimited(ibdev, + "failed to poll cqe for free mr and timeout, remain %d cqe.\n", + cqe_cnt); goto out; } cqe_cnt -= npolled; @@ -5061,10 +5054,8 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); int ret = 0; - if (!check_qp_state(cur_state, new_state)) { - ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n"); + if (!check_qp_state(cur_state, new_state)) return -EINVAL; - } if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { memset(qpc_mask, 0, hr_dev->caps.qpc_sz); @@ -5325,7 +5316,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, /* SW pass context to HW */ ret = hns_roce_v2_qp_modify(hr_dev, context, qpc_mask, hr_qp); if (ret) { - ibdev_err(ibdev, "failed to modify QP, ret = %d.\n", ret); + ibdev_err_ratelimited(ibdev, "failed to modify QP, ret = %d.\n", ret); goto out; } @@ -5463,7 +5454,9 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, ret = hns_roce_v2_query_qpc(hr_dev, hr_qp->qpn, &context); if (ret) { - ibdev_err(ibdev, "failed to query QPC, ret = %d.\n", ret); + ibdev_err_ratelimited(ibdev, + "failed to query QPC, ret = %d.\n", + ret); ret = -EINVAL; goto out; } @@ -5471,7 +5464,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, state = hr_reg_read(&context, QPC_QP_ST); tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state); if (tmp_qp_state == -1) { - ibdev_err(ibdev, "Illegal ib_qp_state\n"); + ibdev_err_ratelimited(ibdev, "Illegal ib_qp_state\n"); ret = -EINVAL; goto out; } @@ -5564,9 +5557,9 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state, IB_QPS_RESET, udata); if (ret) - ibdev_err(ibdev, - "failed to modify QP to RST, ret = %d.\n", - ret); + ibdev_err_ratelimited(ibdev, + "failed to modify QP to RST, ret = %d.\n", + ret); } send_cq = hr_qp->ibqp.send_cq ? to_hr_cq(hr_qp->ibqp.send_cq) : NULL; @@ -5609,9 +5602,9 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata); if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n", - hr_qp->qpn, ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n", + hr_qp->qpn, ret); hns_roce_qp_destroy(hr_dev, hr_qp, udata); @@ -5905,9 +5898,9 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) HNS_ROCE_CMD_MODIFY_CQC, hr_cq->cqn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when modifying CQ, ret = %d.\n", - ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to process cmd when modifying CQ, ret = %d.\n", + ret); err_out: if (ret) @@ -5931,9 +5924,9 @@ static int hns_roce_v2_query_cqc(struct hns_roce_dev *hr_dev, u32 cqn, ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_CQC, cqn); if (ret) { - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when querying CQ, ret = %d.\n", - ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to process cmd when querying CQ, ret = %d.\n", + ret); goto err_mailbox; } diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 846da8c78b8b..b3f4327d0e64 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -138,8 +138,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1)); if (ret) - ibdev_warn(ibdev, "failed to destroy mpt, ret = %d.\n", - ret); + ibdev_warn_ratelimited(ibdev, "failed to destroy mpt, ret = %d.\n", + ret); } free_mr_pbl(hr_dev, mr); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index c9b8233f4b05..70c06ef65603 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -151,8 +151,8 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_SRQ, srq->srqn); if (ret) - dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n", - ret, srq->srqn); + dev_err_ratelimited(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n", + ret, srq->srqn); xa_erase_irq(&srq_table->xa, srq->srqn); From ea4c990fa9e19ffef0648e40c566b94ba5ab31be Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 25 Oct 2024 17:20:36 +0200 Subject: [PATCH 17/51] RDMA/rxe: Fix the qp flush warnings in req When the qp is in error state, the status of WQEs in the queue should be set to error. Or else the following will appear. [ 920.617269] WARNING: CPU: 1 PID: 21 at drivers/infiniband/sw/rxe/rxe_comp.c:756 rxe_completer+0x989/0xcc0 [rdma_rxe] [ 920.617744] Modules linked in: rnbd_client(O) rtrs_client(O) rtrs_core(O) rdma_ucm rdma_cm iw_cm ib_cm crc32_generic rdma_rxe ip6_udp_tunnel udp_tunnel ib_uverbs ib_core loop brd null_blk ipv6 [ 920.618516] CPU: 1 PID: 21 Comm: ksoftirqd/1 Tainted: G O 6.1.113-storage+ #65 [ 920.618986] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 920.619396] RIP: 0010:rxe_completer+0x989/0xcc0 [rdma_rxe] [ 920.619658] Code: 0f b6 84 24 3a 02 00 00 41 89 84 24 44 04 00 00 e9 2a f7 ff ff 39 ca bb 03 00 00 00 b8 0e 00 00 00 48 0f 45 d8 e9 15 f7 ff ff <0f> 0b e9 cb f8 ff ff 41 bf f5 ff ff ff e9 08 f8 ff ff 49 8d bc 24 [ 920.620482] RSP: 0018:ffff97b7c00bbc38 EFLAGS: 00010246 [ 920.620817] RAX: 0000000000000000 RBX: 000000000000000c RCX: 0000000000000008 [ 920.621183] RDX: ffff960dc396ebc0 RSI: 0000000000005400 RDI: ffff960dc4e2fbac [ 920.621548] RBP: 0000000000000000 R08: 0000000000000001 R09: ffffffffac406450 [ 920.621884] R10: ffffffffac4060c0 R11: 0000000000000001 R12: ffff960dc4e2f800 [ 920.622254] R13: ffff960dc4e2f928 R14: ffff97b7c029c580 R15: 0000000000000000 [ 920.622609] FS: 0000000000000000(0000) GS:ffff960ef7d00000(0000) knlGS:0000000000000000 [ 920.622979] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 920.623245] CR2: 00007fa056965e90 CR3: 00000001107f1000 CR4: 00000000000006e0 [ 920.623680] Call Trace: [ 920.623815] [ 920.623933] ? __warn+0x79/0xc0 [ 920.624116] ? rxe_completer+0x989/0xcc0 [rdma_rxe] [ 920.624356] ? report_bug+0xfb/0x150 [ 920.624594] ? handle_bug+0x3c/0x60 [ 920.624796] ? exc_invalid_op+0x14/0x70 [ 920.624976] ? asm_exc_invalid_op+0x16/0x20 [ 920.625203] ? rxe_completer+0x989/0xcc0 [rdma_rxe] [ 920.625474] ? rxe_completer+0x329/0xcc0 [rdma_rxe] [ 920.625749] rxe_do_task+0x80/0x110 [rdma_rxe] [ 920.626037] rxe_requester+0x625/0xde0 [rdma_rxe] [ 920.626310] ? rxe_cq_post+0xe2/0x180 [rdma_rxe] [ 920.626583] ? do_complete+0x18d/0x220 [rdma_rxe] [ 920.626812] ? rxe_completer+0x1a3/0xcc0 [rdma_rxe] [ 920.627050] rxe_do_task+0x80/0x110 [rdma_rxe] [ 920.627285] tasklet_action_common.constprop.0+0xa4/0x120 [ 920.627522] handle_softirqs+0xc2/0x250 [ 920.627728] ? sort_range+0x20/0x20 [ 920.627942] run_ksoftirqd+0x1f/0x30 [ 920.628158] smpboot_thread_fn+0xc7/0x1b0 [ 920.628334] kthread+0xd6/0x100 [ 920.628504] ? kthread_complete_and_exit+0x20/0x20 [ 920.628709] ret_from_fork+0x1f/0x30 [ 920.628892] Fixes: ae720bdb703b ("RDMA/rxe: Generate error completion for error requester QP state") Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20241025152036.121417-1-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_req.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 479c07e6e4ed..87a02f0deb00 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -663,10 +663,12 @@ int rxe_requester(struct rxe_qp *qp) if (unlikely(qp_state(qp) == IB_QPS_ERR)) { wqe = __req_next_wqe(qp); spin_unlock_irqrestore(&qp->state_lock, flags); - if (wqe) + if (wqe) { + wqe->status = IB_WC_WR_FLUSH_ERR; goto err; - else + } else { goto exit; + } } if (unlikely(qp_state(qp) == IB_QPS_RESET)) { From 808ca6de989c598bc5af1ae0ad971a66077efac0 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Mon, 28 Oct 2024 03:06:54 -0700 Subject: [PATCH 18/51] RDMA/bnxt_re: Check cqe flags to know imm_data vs inv_irkey Invalidate rkey is cpu endian and immediate data is in big endian format. Both immediate data and invalidate the remote key returned by HW is in little endian format. While handling the commit in fixes tag, the difference between immediate data and invalidate rkey endianness was not considered. Without changes of this patch, Kernel ULP was failing while processing inv_rkey. dmesg log snippet - nvme nvme0: Bogus remote invalidation for rkey 0x2000019Fix in this patch Do endianness conversion based on completion queue entry flag. Also, the HW completions are already converted to host endianness in bnxt_qplib_cq_process_res_rc and bnxt_qplib_cq_process_res_ud and there is no need to convert it again in bnxt_re_poll_cq. Modified the union to hold the correct data type. Fixes: 95b087f87b78 ("bnxt_re: Fix imm_data endianness") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1730110014-20755-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 7 +++++-- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index e6108079dad5..09e61553e43f 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3687,7 +3687,7 @@ static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *gsi_sqp, wc->byte_len = orig_cqe->length; wc->qp = &gsi_qp->ib_qp; - wc->ex.imm_data = cpu_to_be32(le32_to_cpu(orig_cqe->immdata)); + wc->ex.imm_data = cpu_to_be32(orig_cqe->immdata); wc->src_qp = orig_cqe->src_qp; memcpy(wc->smac, orig_cqe->smac, ETH_ALEN); if (bnxt_re_is_vlan_pkt(orig_cqe, &vlan_id, &sl)) { @@ -3832,7 +3832,10 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) (unsigned long)(cqe->qp_handle), struct bnxt_re_qp, qplib_qp); wc->qp = &qp->ib_qp; - wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->immdata)); + if (cqe->flags & CQ_RES_RC_FLAGS_IMM) + wc->ex.imm_data = cpu_to_be32(cqe->immdata); + else + wc->ex.invalidate_rkey = cqe->invrkey; wc->src_qp = cqe->src_qp; memcpy(wc->smac, cqe->smac, ETH_ALEN); wc->port_num = 1; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index fb01576e545d..b5a905819ecc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -410,7 +410,7 @@ struct bnxt_qplib_cqe { u16 cfa_meta; u64 wr_id; union { - __le32 immdata; + u32 immdata; u32 invrkey; }; u64 qp_handle; From 1103579d6e32a97c71ef43e045ea559bd27d4c15 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Wed, 30 Oct 2024 09:30:06 +0000 Subject: [PATCH 19/51] RDMA/efa: Report link speed according to device attributes Set port link speed and width based on max bandwidth acquired from the device instead of using constant 100 Gbps. Use a default value in case the device didn't set the field. Reviewed-by: Daniel Kranzdorf Reviewed-by: Firas Jahjah Signed-off-by: Michael Margolin Link: https://patch.msgid.link/20241030093006.21352-1-mrgolin@amazon.com Signed-off-by: Leon Romanovsky --- .../infiniband/hw/efa/efa_admin_cmds_defs.h | 9 ++++ drivers/infiniband/hw/efa/efa_com_cmd.c | 1 + drivers/infiniband/hw/efa/efa_com_cmd.h | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 45 ++++++++++++++++++- 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 88a9aee7e743..fe0b6aec7839 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -718,6 +718,15 @@ struct efa_admin_feature_device_attr_desc { /* Unique global ID for an EFA device */ u64 guid; + + /* The device maximum link speed in Gbit/sec */ + u16 max_link_speed_gbps; + + /* MBZ */ + u16 reserved0; + + /* MBZ */ + u32 reserved1; }; struct efa_admin_feature_queue_attr_desc { diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 9e04edb9dbda..c6b89c45fdc9 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -467,6 +467,7 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->max_rdma_size = resp.u.device_attr.max_rdma_size; result->device_caps = resp.u.device_attr.device_caps; result->guid = resp.u.device_attr.guid; + result->max_link_speed_gbps = resp.u.device_attr.max_link_speed_gbps; if (result->admin_api_version < 1) { ibdev_err_ratelimited( diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 25f02c0d9698..5511355b700d 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -142,6 +142,7 @@ struct efa_com_get_device_attr_result { u16 max_wr_rdma_sge; u16 max_tx_batch; u16 min_sq_depth; + u16 max_link_speed_gbps; u8 db_bar; }; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index ca3af866a5df..a8645a40730f 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -85,6 +85,8 @@ static const struct rdma_stat_desc efa_port_stats_descs[] = { EFA_DEFINE_PORT_STATS(EFA_STATS_STR) }; +#define EFA_DEFAULT_LINK_SPEED_GBPS 100 + #define EFA_CHUNK_PAYLOAD_SHIFT 12 #define EFA_CHUNK_PAYLOAD_SIZE BIT(EFA_CHUNK_PAYLOAD_SHIFT) #define EFA_CHUNK_PAYLOAD_PTR_SIZE 8 @@ -277,10 +279,47 @@ int efa_query_device(struct ib_device *ibdev, return 0; } +static void efa_link_gbps_to_speed_and_width(u16 gbps, + enum ib_port_speed *speed, + enum ib_port_width *width) +{ + if (gbps >= 400) { + *width = IB_WIDTH_8X; + *speed = IB_SPEED_HDR; + } else if (gbps >= 200) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_HDR; + } else if (gbps >= 120) { + *width = IB_WIDTH_12X; + *speed = IB_SPEED_FDR10; + } else if (gbps >= 100) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } else if (gbps >= 60) { + *width = IB_WIDTH_12X; + *speed = IB_SPEED_DDR; + } else if (gbps >= 50) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_HDR; + } else if (gbps >= 40) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else if (gbps >= 30) { + *width = IB_WIDTH_12X; + *speed = IB_SPEED_SDR; + } else { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } +} + int efa_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { struct efa_dev *dev = to_edev(ibdev); + enum ib_port_speed link_speed; + enum ib_port_width link_width; + u16 link_gbps; props->lmc = 1; @@ -288,8 +327,10 @@ int efa_query_port(struct ib_device *ibdev, u32 port, props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; props->gid_tbl_len = 1; props->pkey_tbl_len = 1; - props->active_speed = IB_SPEED_EDR; - props->active_width = IB_WIDTH_4X; + link_gbps = dev->dev_attr.max_link_speed_gbps ?: EFA_DEFAULT_LINK_SPEED_GBPS; + efa_link_gbps_to_speed_and_width(link_gbps, &link_speed, &link_width); + props->active_speed = link_speed; + props->active_width = link_width; props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); props->max_msg_sz = dev->dev_attr.mtu; From 8ab3138a9b2dcb0ddf281240cf8cba414eb1224a Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Tue, 3 Sep 2024 14:37:51 +0300 Subject: [PATCH 20/51] net/mlx5: Introduce data placement ordering bits Introduce out-of-order (OOO) data placement (DP) IFC related bits to support OOO DP QP. Signed-off-by: Edward Srouji Reviewed-by: Yishai Hadas Link: https://patch.msgid.link/f30e5cbb5459fd02f27f35909bb545cab346b58b.1725362773.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 96d369112bfa..2a037843b117 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1872,7 +1872,11 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_328[0x2]; u8 relaxed_ordering_read[0x1]; u8 log_max_pd[0x5]; - u8 reserved_at_330[0x5]; + u8 dp_ordering_ooo_all_ud[0x1]; + u8 dp_ordering_ooo_all_uc[0x1]; + u8 dp_ordering_ooo_all_xrc[0x1]; + u8 dp_ordering_ooo_all_dc[0x1]; + u8 dp_ordering_ooo_all_rc[0x1]; u8 pcie_reset_using_hotreset_method[0x1]; u8 pci_sync_for_fw_update_with_driver_unload[0x1]; u8 vnic_env_cnt_steering_fail[0x1]; @@ -2094,7 +2098,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_0[0x80]; u8 migratable[0x1]; - u8 reserved_at_81[0x11]; + u8 reserved_at_81[0x7]; + u8 dp_ordering_force[0x1]; + u8 reserved_at_89[0x9]; u8 query_vuid[0x1]; u8 reserved_at_93[0x5]; u8 umr_log_entity_size_5[0x1]; @@ -3524,7 +3530,8 @@ struct mlx5_ifc_qpc_bits { u8 latency_sensitive[0x1]; u8 reserved_at_24[0x1]; u8 drain_sigerr[0x1]; - u8 reserved_at_26[0x2]; + u8 reserved_at_26[0x1]; + u8 dp_ordering_force[0x1]; u8 pd[0x18]; u8 mtu[0x3]; @@ -3597,7 +3604,8 @@ struct mlx5_ifc_qpc_bits { u8 rae[0x1]; u8 reserved_at_493[0x1]; u8 page_offset[0x6]; - u8 reserved_at_49a[0x3]; + u8 reserved_at_49a[0x2]; + u8 dp_ordering_1[0x1]; u8 cd_slave_receive[0x1]; u8 cd_slave_send[0x1]; u8 cd_master[0x1]; @@ -4507,7 +4515,8 @@ struct mlx5_ifc_dctc_bits { u8 state[0x4]; u8 reserved_at_8[0x18]; - u8 reserved_at_20[0x8]; + u8 reserved_at_20[0x7]; + u8 dp_ordering_force[0x1]; u8 user_index[0x18]; u8 reserved_at_40[0x8]; @@ -4522,7 +4531,9 @@ struct mlx5_ifc_dctc_bits { u8 latency_sensitive[0x1]; u8 rlky[0x1]; u8 free_ar[0x1]; - u8 reserved_at_73[0xd]; + u8 reserved_at_73[0x1]; + u8 dp_ordering_1[0x1]; + u8 reserved_at_75[0xb]; u8 reserved_at_80[0x8]; u8 cs_res[0x8]; From 775e6d3c8fda41083b16c26d05163fd69f029a62 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Thu, 31 Oct 2024 17:20:19 +0800 Subject: [PATCH 21/51] RDMA/rxe: Set queue pair cur_qp_state when being queried Same with commit e375b9c92985 ("RDMA/cxgb4: Set queue pair state when being queried"). The API for ib_query_qp requires the driver to set cur_qp_state on return, add the missing set. Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Liu Jian Link: https://patch.msgid.link/20241031092019.2138467-1-liujian56@huawei.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_qp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index d2f7b5195c19..91d329e90308 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -775,6 +775,7 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) * Yield the processor */ spin_lock_irqsave(&qp->state_lock, flags); + attr->cur_qp_state = qp_state(qp); if (qp->attr.sq_draining) { spin_unlock_irqrestore(&qp->state_lock, flags); cond_resched(); From 7363eb76b7f3b860ecfb8fcaf537e143bfd725bd Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Thu, 31 Oct 2024 19:34:40 -0700 Subject: [PATCH 22/51] RDMA/bnxt_re: Support driver specific data collection using rdma tool Allow users to dump driver specific resource details when queried through rdma tool. This supports the driver data for QP, CQ, MR and SRQ. Reviewed-by: Kalesh AP Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1730428483-17841-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 141 +++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index d825eda6189e..24124c22f5e4 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -870,6 +870,139 @@ static const struct attribute_group bnxt_re_dev_attr_group = { .attrs = bnxt_re_attributes, }; +static int bnxt_re_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ib_mr) +{ + struct bnxt_qplib_hwq *mr_hwq; + struct nlattr *table_attr; + struct bnxt_re_mr *mr; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + mr_hwq = &mr->qplib_mr.hwq; + + if (rdma_nl_put_driver_u32(msg, "page_size", + mr_hwq->qe_ppg * mr_hwq->element_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "max_elements", mr_hwq->max_elements)) + goto err; + if (rdma_nl_put_driver_u32(msg, "element_size", mr_hwq->element_size)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "hwq", (unsigned long)mr_hwq)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "va", mr->qplib_mr.va)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq) +{ + struct bnxt_qplib_hwq *cq_hwq; + struct nlattr *table_attr; + struct bnxt_re_cq *cq; + + cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + cq_hwq = &cq->qplib_cq.hwq; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + if (rdma_nl_put_driver_u32(msg, "cq_depth", cq_hwq->depth)) + goto err; + if (rdma_nl_put_driver_u32(msg, "max_elements", cq_hwq->max_elements)) + goto err; + if (rdma_nl_put_driver_u32(msg, "element_size", cq_hwq->element_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "max_wqe", cq->qplib_cq.max_wqe)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp) +{ + struct bnxt_qplib_qp *qplib_qp; + struct nlattr *table_attr; + struct bnxt_re_qp *qp; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + qplib_qp = &qp->qplib_qp; + + if (rdma_nl_put_driver_u32(msg, "sq_max_wqe", qplib_qp->sq.max_wqe)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_max_sge", qplib_qp->sq.max_sge)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_wqe_size", qplib_qp->sq.wqe_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_swq_start", qplib_qp->sq.swq_start)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_swq_last", qplib_qp->sq.swq_last)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_max_wqe", qplib_qp->rq.max_wqe)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_max_sge", qplib_qp->rq.max_sge)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_wqe_size", qplib_qp->rq.wqe_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_swq_start", qplib_qp->rq.swq_start)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_swq_last", qplib_qp->rq.swq_last)) + goto err; + if (rdma_nl_put_driver_u32(msg, "timeout", qplib_qp->timeout)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_srq_entry(struct sk_buff *msg, struct ib_srq *ib_srq) +{ + struct nlattr *table_attr; + struct bnxt_re_srq *srq; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); + + if (rdma_nl_put_driver_u32_hex(msg, "wqe_size", srq->qplib_srq.wqe_size)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "max_wqe", srq->qplib_srq.max_wqe)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "max_sge", srq->qplib_srq.max_sge)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + static const struct ib_device_ops bnxt_re_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_BNXT_RE, @@ -928,6 +1061,13 @@ static const struct ib_device_ops bnxt_re_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx), }; +static const struct ib_device_ops restrack_ops = { + .fill_res_cq_entry = bnxt_re_fill_res_cq_entry, + .fill_res_qp_entry = bnxt_re_fill_res_qp_entry, + .fill_res_mr_entry = bnxt_re_fill_res_mr_entry, + .fill_res_srq_entry = bnxt_re_fill_res_srq_entry, +}; + static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) { struct ib_device *ibdev = &rdev->ibdev; @@ -949,6 +1089,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->driver_def = bnxt_re_uapi_defs; ib_set_device_ops(ibdev, &bnxt_re_dev_ops); + ib_set_device_ops(ibdev, &restrack_ops); ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1); if (ret) return ret; From 3b72946bdf19f4dffbf84b2ab0d26f09763638c4 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Thu, 31 Oct 2024 19:34:41 -0700 Subject: [PATCH 23/51] RDMA/bnxt_re: Add support for querying HW contexts Implements support for querying the hardware resource contexts. This raw data can be used for the debugging of the field issues. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1730428483-17841-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 19 ++++++++++ drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 2 ++ drivers/infiniband/hw/bnxt_re/qplib_sp.c | 35 +++++++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_sp.h | 2 ++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 40 ++++++++++++++++++++++ 5 files changed, 98 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index bb28a1fe1430..49186a1b0ce4 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -247,4 +247,23 @@ static inline void bnxt_re_set_pacing_dev_state(struct bnxt_re_dev *rdev) rdev->qplib_res.pacing_data->dev_err_state = test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); } + +static inline int bnxt_re_read_context_allowed(struct bnxt_re_dev *rdev) +{ + if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) || + rdev->rcfw.res->cctx->hwrm_intf_ver < HWRM_VERSION_READ_CTX) + return -EOPNOTSUPP; + return 0; +} + +#define BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P5 1088 +#define BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P5 128 +#define BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P5 128 +#define BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P5 192 + +#define BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P7 1088 +#define BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P7 192 +#define BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P7 192 +#define BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P7 192 + #endif diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 45996e60a0d0..3e723d7d0543 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -131,6 +131,8 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) #define RCFW_CMD_IS_BLOCKING 0x8000 #define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL +/* HWRM version 1.10.3.18 */ +#define HWRM_VERSION_READ_CTX 0x1000A00030012 /* Crsq buf is 1024-Byte */ struct bnxt_qplib_crsbe { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 4f75e7e5bcf7..ad636d708a8a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -972,3 +972,38 @@ int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, rc = bnxt_qplib_rcfw_send_message(res->rcfw, &msg); return rc; } + +int bnxt_qplib_read_context(struct bnxt_qplib_rcfw *rcfw, u8 res_type, + u32 xid, u32 resp_size, void *resp_va) +{ + struct creq_read_context resp = {}; + struct bnxt_qplib_cmdqmsg msg = {}; + struct cmdq_read_context req = {}; + struct bnxt_qplib_rcfw_sbuf sbuf; + int rc; + + sbuf.size = resp_size; + sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size, + &sbuf.dma_addr, GFP_KERNEL); + if (!sbuf.sb) + return -ENOMEM; + + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_READ_CONTEXT, sizeof(req)); + req.resp_addr = cpu_to_le64(sbuf.dma_addr); + req.resp_size = resp_size / BNXT_QPLIB_CMDQE_UNITS; + + req.xid = cpu_to_le32(xid); + req.type = res_type; + + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), + sizeof(resp), 0); + rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); + if (rc) + goto free_mem; + + memcpy(resp_va, sbuf.sb, resp_size); +free_mem: + dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); + return rc; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index acd9c14a31c4..29b841ab6c0d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -352,6 +352,8 @@ int bnxt_qplib_qext_stat(struct bnxt_qplib_rcfw *rcfw, u32 fid, struct bnxt_qplib_ext_stat *estat); int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, struct bnxt_qplib_cc_param *cc_param); +int bnxt_qplib_read_context(struct bnxt_qplib_rcfw *rcfw, u8 type, u32 xid, + u32 resp_size, void *resp_va); #define BNXT_VAR_MAX_WQE 4352 #define BNXT_VAR_MAX_SLOT_ALIGN 256 diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index a7679eedbf27..d9c53731871e 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -2265,6 +2265,46 @@ struct creq_set_func_resources_resp { u8 reserved48[6]; }; +/* cmdq_read_context (size:192b/24B) */ +struct cmdq_read_context { + u8 opcode; + #define CMDQ_READ_CONTEXT_OPCODE_READ_CONTEXT 0x85UL + #define CMDQ_READ_CONTEXT_OPCODE_LAST CMDQ_READ_CONTEXT_OPCODE_READ_CONTEXT + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 xid; + u8 type; + #define CMDQ_READ_CONTEXT_TYPE_QPC 0x0UL + #define CMDQ_READ_CONTEXT_TYPE_CQ 0x1UL + #define CMDQ_READ_CONTEXT_TYPE_MRW 0x2UL + #define CMDQ_READ_CONTEXT_TYPE_SRQ 0x3UL + #define CMDQ_READ_CONTEXT_TYPE_LAST CMDQ_READ_CONTEXT_TYPE_SRQ + u8 unused_0[3]; +}; + +/* creq_read_context (size:128b/16B) */ +struct creq_read_context { + u8 type; + #define CREQ_READ_CONTEXT_TYPE_MASK 0x3fUL + #define CREQ_READ_CONTEXT_TYPE_SFT 0 + #define CREQ_READ_CONTEXT_TYPE_QP_EVENT 0x38UL + #define CREQ_READ_CONTEXT_TYPE_LAST CREQ_READ_CONTEXT_TYPE_QP_EVENT + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_READ_CONTEXT_V 0x1UL + u8 event; + #define CREQ_READ_CONTEXT_EVENT_READ_CONTEXT 0x85UL + #define CREQ_READ_CONTEXT_EVENT_LAST CREQ_READ_CONTEXT_EVENT_READ_CONTEXT + __le16 reserved16; + __le32 reserved_32; +}; + /* cmdq_map_tc_to_cos (size:192b/24B) */ struct cmdq_map_tc_to_cos { u8 opcode; From e4bcf8eb2a0e7e50b703449f504c654b2b16976d Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Thu, 31 Oct 2024 19:34:42 -0700 Subject: [PATCH 24/51] RDMA/bnxt_re: Support raw data query for each resources Support interfaces to get the raw data for each of the resources. Use this interface to get some of the HW structures from active resources. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1730428483-17841-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 118 +++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 24124c22f5e4..c227fdd06a54 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -903,6 +903,35 @@ static int bnxt_re_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ib_mr) return -EMSGSIZE; } +static int bnxt_re_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_mr *mr; + int err, len; + void *data; + + mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + rdev = mr->rdev; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P5; + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, CMDQ_READ_CONTEXT_TYPE_MRW, + mr->qplib_mr.lkey, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + static int bnxt_re_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq) { struct bnxt_qplib_hwq *cq_hwq; @@ -933,6 +962,36 @@ static int bnxt_re_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq) return -EMSGSIZE; } +static int bnxt_re_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_cq *cq; + int err, len; + void *data; + + cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + rdev = cq->rdev; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P5; + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, + CMDQ_READ_CONTEXT_TYPE_CQ, + cq->qplib_cq.id, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + static int bnxt_re_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp) { struct bnxt_qplib_qp *qplib_qp; @@ -977,6 +1036,31 @@ static int bnxt_re_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp) return -EMSGSIZE; } +static int bnxt_re_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibqp->device, ibdev); + int err, len; + void *data; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P5; + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, CMDQ_READ_CONTEXT_TYPE_QPC, + ibqp->qp_num, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + static int bnxt_re_fill_res_srq_entry(struct sk_buff *msg, struct ib_srq *ib_srq) { struct nlattr *table_attr; @@ -1003,6 +1087,36 @@ static int bnxt_re_fill_res_srq_entry(struct sk_buff *msg, struct ib_srq *ib_srq return -EMSGSIZE; } +static int bnxt_re_fill_res_srq_entry_raw(struct sk_buff *msg, struct ib_srq *ib_srq) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_srq *srq; + int err, len; + void *data; + + srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); + rdev = srq->rdev; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P5; + + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, CMDQ_READ_CONTEXT_TYPE_SRQ, + srq->qplib_srq.id, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + static const struct ib_device_ops bnxt_re_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_BNXT_RE, @@ -1063,9 +1177,13 @@ static const struct ib_device_ops bnxt_re_dev_ops = { static const struct ib_device_ops restrack_ops = { .fill_res_cq_entry = bnxt_re_fill_res_cq_entry, + .fill_res_cq_entry_raw = bnxt_re_fill_res_cq_entry_raw, .fill_res_qp_entry = bnxt_re_fill_res_qp_entry, + .fill_res_qp_entry_raw = bnxt_re_fill_res_qp_entry_raw, .fill_res_mr_entry = bnxt_re_fill_res_mr_entry, + .fill_res_mr_entry_raw = bnxt_re_fill_res_mr_entry_raw, .fill_res_srq_entry = bnxt_re_fill_res_srq_entry, + .fill_res_srq_entry_raw = bnxt_re_fill_res_srq_entry_raw, }; static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) From d7d54769c042cf24e4e8aeae03ca9fb5fcb6f714 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 31 Oct 2024 19:34:43 -0700 Subject: [PATCH 25/51] RDMA/bnxt_re: Add debugfs hook in the driver Adding support for a per device debugfs folder for exporting some of the device specific debug information. Added support to get QP info for now. The same folder can be used to export other debug features in future. Signed-off-by: Saravanan Vajravel Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1730428483-17841-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/Makefile | 3 +- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 + drivers/infiniband/hw/bnxt_re/debugfs.c | 138 +++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/debugfs.h | 21 ++++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 4 + drivers/infiniband/hw/bnxt_re/ib_verbs.h | 1 + drivers/infiniband/hw/bnxt_re/main.c | 13 ++- 7 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/hw/bnxt_re/debugfs.c create mode 100644 drivers/infiniband/hw/bnxt_re/debugfs.h diff --git a/drivers/infiniband/hw/bnxt_re/Makefile b/drivers/infiniband/hw/bnxt_re/Makefile index ee9bb1be61ea..f63417d2ccc6 100644 --- a/drivers/infiniband/hw/bnxt_re/Makefile +++ b/drivers/infiniband/hw/bnxt_re/Makefile @@ -4,4 +4,5 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnxt obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o bnxt_re-y := main.o ib_verbs.o \ qplib_res.o qplib_rcfw.o \ - qplib_sp.o qplib_fp.o hw_counters.o + qplib_sp.o qplib_fp.o hw_counters.o \ + debugfs.o diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 49186a1b0ce4..d1b7c2029bdc 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -221,6 +221,8 @@ struct bnxt_re_dev { struct delayed_work dbq_pacing_work; DECLARE_HASHTABLE(cq_hash, MAX_CQ_HASH_BITS); DECLARE_HASHTABLE(srq_hash, MAX_SRQ_HASH_BITS); + struct dentry *dbg_root; + struct dentry *qp_debugfs; }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c new file mode 100644 index 000000000000..7c47039044ef --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright (c) 2024, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * Description: Debugfs component of the bnxt_re driver + */ + +#include +#include +#include + +#include "bnxt_ulp.h" +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_fp.h" +#include "qplib_rcfw.h" +#include "bnxt_re.h" +#include "ib_verbs.h" +#include "debugfs.h" + +static struct dentry *bnxt_re_debugfs_root; + +static inline const char *bnxt_re_qp_state_str(u8 state) +{ + switch (state) { + case CMDQ_MODIFY_QP_NEW_STATE_RESET: + return "RST"; + case CMDQ_MODIFY_QP_NEW_STATE_INIT: + return "INIT"; + case CMDQ_MODIFY_QP_NEW_STATE_RTR: + return "RTR"; + case CMDQ_MODIFY_QP_NEW_STATE_RTS: + return "RTS"; + case CMDQ_MODIFY_QP_NEW_STATE_SQE: + return "SQER"; + case CMDQ_MODIFY_QP_NEW_STATE_SQD: + return "SQD"; + case CMDQ_MODIFY_QP_NEW_STATE_ERR: + return "ERR"; + default: + return "Invalid QP state"; + } +} + +static inline const char *bnxt_re_qp_type_str(u8 type) +{ + switch (type) { + case CMDQ_CREATE_QP1_TYPE_GSI: return "QP1"; + case CMDQ_CREATE_QP_TYPE_GSI: return "QP1"; + case CMDQ_CREATE_QP_TYPE_RC: return "RC"; + case CMDQ_CREATE_QP_TYPE_UD: return "UD"; + case CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE: return "RAW_ETHERTYPE"; + default: return "Invalid transport type"; + } +} + +static ssize_t qp_info_read(struct file *filep, + char __user *buffer, + size_t count, loff_t *ppos) +{ + struct bnxt_re_qp *qp = filep->private_data; + char *buf; + int len; + + if (*ppos) + return 0; + + buf = kasprintf(GFP_KERNEL, + "QPN\t\t: %d\n" + "transport\t: %s\n" + "state\t\t: %s\n" + "mtu\t\t: %d\n" + "timeout\t\t: %d\n" + "remote QPN\t: %d\n", + qp->qplib_qp.id, + bnxt_re_qp_type_str(qp->qplib_qp.type), + bnxt_re_qp_state_str(qp->qplib_qp.state), + qp->qplib_qp.mtu, + qp->qplib_qp.timeout, + qp->qplib_qp.dest_qpn); + if (!buf) + return -ENOMEM; + if (count < strlen(buf)) { + kfree(buf); + return -ENOSPC; + } + len = simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); + kfree(buf); + return len; +} + +static const struct file_operations debugfs_qp_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = qp_info_read, +}; + +void bnxt_re_debug_add_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) +{ + char resn[32]; + + sprintf(resn, "0x%x", qp->qplib_qp.id); + qp->dentry = debugfs_create_file(resn, 0400, rdev->qp_debugfs, qp, &debugfs_qp_fops); +} + +void bnxt_re_debug_rem_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) +{ + debugfs_remove(qp->dentry); +} + +void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) +{ + struct pci_dev *pdev = rdev->en_dev->pdev; + + rdev->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), bnxt_re_debugfs_root); + + rdev->qp_debugfs = debugfs_create_dir("QPs", rdev->dbg_root); +} + +void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev) +{ + debugfs_remove_recursive(rdev->qp_debugfs); + + debugfs_remove_recursive(rdev->dbg_root); + rdev->dbg_root = NULL; +} + +void bnxt_re_register_debugfs(void) +{ + bnxt_re_debugfs_root = debugfs_create_dir("bnxt_re", NULL); +} + +void bnxt_re_unregister_debugfs(void) +{ + debugfs_remove(bnxt_re_debugfs_root); +} diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h new file mode 100644 index 000000000000..cd3be0a9ec7e --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/debugfs.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright (c) 2024, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * Description: Debugfs header + */ + +#ifndef __BNXT_RE_DEBUGFS__ +#define __BNXT_RE_DEBUGFS__ + +void bnxt_re_debug_add_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp); +void bnxt_re_debug_rem_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp); + +void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev); +void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev); + +void bnxt_re_register_debugfs(void); +void bnxt_re_unregister_debugfs(void); + +#endif diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 09e61553e43f..9a188ccd4ead 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -62,6 +62,7 @@ #include "bnxt_re.h" #include "ib_verbs.h" +#include "debugfs.h" #include #include @@ -997,6 +998,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) else if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD) atomic_dec(&rdev->stats.res.ud_qp_count); + bnxt_re_debug_rem_qpinfo(rdev, qp); + ib_umem_release(qp->rumem); ib_umem_release(qp->sumem); @@ -1676,6 +1679,7 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, if (active_qps > rdev->stats.res.ud_qp_watermark) rdev->stats.res.ud_qp_watermark = active_qps; } + bnxt_re_debug_add_qpinfo(rdev, qp); return 0; qp_destroy: diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 83a584e2a16e..ac59f1d73b15 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -95,6 +95,7 @@ struct bnxt_re_qp { struct ib_ud_header qp1_hdr; struct bnxt_re_cq *scq; struct bnxt_re_cq *rcq; + struct dentry *dentry; }; struct bnxt_re_cq { diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index c227fdd06a54..4127227a9447 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -67,6 +67,7 @@ #include #include "bnxt.h" #include "hw_counters.h" +#include "debugfs.h" static char version[] = BNXT_RE_DESC "\n"; @@ -1863,6 +1864,8 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) u8 type; int rc; + bnxt_re_debugfs_rem_pdev(rdev); + if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) cancel_delayed_work_sync(&rdev->worker); @@ -2063,6 +2066,8 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) hash_init(rdev->srq_hash); + bnxt_re_debugfs_add_pdev(rdev); + return 0; free_sctx: bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); @@ -2389,18 +2394,24 @@ static int __init bnxt_re_mod_init(void) int rc; pr_info("%s: %s", ROCE_DRV_MODULE_NAME, version); + bnxt_re_register_debugfs(); + rc = auxiliary_driver_register(&bnxt_re_driver); if (rc) { pr_err("%s: Failed to register auxiliary driver\n", ROCE_DRV_MODULE_NAME); - return rc; + goto err_debug; } return 0; +err_debug: + bnxt_re_unregister_debugfs(); + return rc; } static void __exit bnxt_re_mod_exit(void) { auxiliary_driver_unregister(&bnxt_re_driver); + bnxt_re_unregister_debugfs(); } module_init(bnxt_re_mod_init); From 8b36f7c3c6618dc97697a6a20a13b29651f68ab6 Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Tue, 3 Sep 2024 14:37:52 +0300 Subject: [PATCH 26/51] RDMA/mlx5: Support OOO RX WQE consumption Support QP with out-of-order (OOO) capabilities enabled. This allows WRs on the receiver side of the QP to be consumed OOO, permitting the sender side to transmit messages without guaranteeing arrival order on the receiver side. When enabled, the completion ordering of WRs remains in-order, regardless of the Receive WRs consumption order. RDMA Read and RDMA Atomic operations on the responder side continue to be executed in-order, while the ordering of data placement for RDMA Write and Send operations is not guaranteed. Atomic operations larger than 8 bytes are currently not supported. Therefore, when this feature is enabled, the created QP restricts its atomic support to 8 bytes at most. In addition, when querying the device, a new flag is returned in response to indicate that the Kernel supports OOO QP. Signed-off-by: Edward Srouji Reviewed-by: Yishai Hadas Link: https://patch.msgid.link/06ac609a5f358c8fb0a090d22c61a2f9329d82e6.1725362773.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 8 +++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/qp.c | 51 +++++++++++++++++++++++++--- include/uapi/rdma/mlx5-abi.h | 5 +++ 4 files changed, 60 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4999239c8f41..b4476df96ed5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1182,6 +1182,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE; resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; + + if (MLX5_CAP_GEN_2(mdev, dp_ordering_force) && + (MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc))) + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP; } if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) { diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 23fd72f7f63d..ed4eaaa7ac71 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -521,6 +521,7 @@ struct mlx5_ib_qp { struct mlx5_bf bf; u8 has_rq:1; u8 is_rss:1; + u8 is_ooo_rq:1; /* only for user space QPs. For kernel * we have it from the bf object diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index e39b1a101e97..837b662b41de 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1960,7 +1960,7 @@ static int atomic_size_to_mode(int size_mask) } static int get_atomic_mode(struct mlx5_ib_dev *dev, - enum ib_qp_type qp_type) + struct mlx5_ib_qp *qp) { u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic); @@ -1970,7 +1970,7 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, if (!atomic) return -EOPNOTSUPP; - if (qp_type == MLX5_IB_QPT_DCT) + if (qp->type == MLX5_IB_QPT_DCT) atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); else atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); @@ -1984,6 +1984,10 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD)) atomic_mode = MLX5_ATOMIC_MODE_IB_COMP; + /* OOO DP QPs do not support larger than 8-Bytes atomic operations */ + if (atomic_mode > MLX5_ATOMIC_MODE_8B && qp->is_ooo_rq) + atomic_mode = MLX5_ATOMIC_MODE_8B; + return atomic_mode; } @@ -2839,6 +2843,29 @@ static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; } +static bool get_dp_ooo_cap(struct mlx5_core_dev *mdev, enum ib_qp_type qp_type) +{ + if (!MLX5_CAP_GEN_2(mdev, dp_ordering_force)) + return false; + + switch (qp_type) { + case IB_QPT_RC: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc); + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc); + case IB_QPT_UC: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc); + case IB_QPT_UD: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud); + case MLX5_IB_QPT_DCI: + case MLX5_IB_QPT_DCT: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc); + default: + return false; + } +} + static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag, bool cond, struct mlx5_ib_qp *qp) { @@ -3365,7 +3392,7 @@ static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp, if (access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; - atomic_mode = get_atomic_mode(dev, qp->type); + atomic_mode = get_atomic_mode(dev, qp); if (atomic_mode < 0) return -EOPNOTSUPP; @@ -4316,6 +4343,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) MLX5_SET(qpc, qpc, deth_sqpn, 1); + if (qp->is_ooo_rq && cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + MLX5_SET(qpc, qpc, dp_ordering_1, 1); + MLX5_SET(qpc, qpc, dp_ordering_force, 1); + } + mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); @@ -4531,7 +4563,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; - atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT); + atomic_mode = get_atomic_mode(dev, qp); if (atomic_mode < 0) return -EOPNOTSUPP; @@ -4573,6 +4605,10 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7); + if (qp->is_ooo_rq) { + MLX5_SET(dctc, dctc, dp_ordering_1, 1); + MLX5_SET(dctc, dctc, dp_ordering_force, 1); + } err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in, MLX5_ST_SZ_BYTES(create_dct_in), out, @@ -4676,11 +4712,16 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, min(udata->inlen, sizeof(ucmd)))) return -EFAULT; - if (ucmd.comp_mask || + if (ucmd.comp_mask & ~MLX5_IB_MODIFY_QP_OOO_DP || memchr_inv(&ucmd.burst_info.reserved, 0, sizeof(ucmd.burst_info.reserved))) return -EOPNOTSUPP; + if (ucmd.comp_mask & MLX5_IB_MODIFY_QP_OOO_DP) { + if (!get_dp_ooo_cap(dev->mdev, qp->type)) + return -EOPNOTSUPP; + qp->is_ooo_rq = 1; + } } if (qp->type == IB_QPT_GSI) diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index d4f6a36dffb0..8a6ad6c6841c 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -252,6 +252,7 @@ enum mlx5_ib_query_dev_resp_flags { MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD = 1 << 1, MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE = 1 << 2, MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT = 1 << 3, + MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP = 1 << 4, }; enum mlx5_ib_tunnel_offloads { @@ -439,6 +440,10 @@ struct mlx5_ib_burst_info { __u16 reserved; }; +enum mlx5_ib_modify_qp_mask { + MLX5_IB_MODIFY_QP_OOO_DP = 1 << 0, +}; + struct mlx5_ib_modify_qp { __u32 comp_mask; struct mlx5_ib_burst_info burst_info; From eb3d354efb39576c86c1e659807c57c557f2f68a Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 31 Oct 2024 11:48:14 +0200 Subject: [PATCH 27/51] RDMA/mlx5: Support querying per-plane IB PortCounters On a SMI device, set requested plane_num when querying PPCNT register with the PortCounters Attribute group. Signed-off-by: Mark Zhang Reviewed-by: Maher Sanalla Link: https://patch.msgid.link/828d57444a0a41042556bb0a4394ecf2fcaed639.1730368052.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mad.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 1b6c5e37d169..2453ae4384a7 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -278,7 +278,13 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, goto done; } - err = query_ib_ppcnt(mdev, mdev_port_num, 0, out_cnt, sz, 0); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) + err = query_ib_ppcnt(mdev, mdev_port_num, port_num, + out_cnt, sz, 0); + else + err = query_ib_ppcnt(mdev, mdev_port_num, 0, + out_cnt, sz, 0); + if (!err) pma_cnt_assign(pma_cnt, out_cnt); } From 6d9c7b272966f13ebbf39687620f395d97f4d15d Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 31 Oct 2024 15:36:50 +0200 Subject: [PATCH 28/51] RDMA/mlx5: Call dev_put() after the blocking notifier Move dev_put() call to occur directly after the blocking notifier, instead of within the event handler. Fixes: 8d159eb2117b ("RDMA/mlx5: Use IB set_netdev and get_netdev functions") Signed-off-by: Chiara Meiohas Link: https://patch.msgid.link/342ff94b3dcbb07da1c7dab862a73933d604b717.1730381292.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b4476df96ed5..5f7fe32b9051 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3242,7 +3242,6 @@ static int lag_event(struct notifier_block *nb, unsigned long event, void *data) } err = ib_device_set_netdev(&dev->ib_dev, ndev, portnum + 1); - dev_put(ndev); if (err) return err; /* Rescan gids after new netdev assignment */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 8577db3308cc..d661267d98ff 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -516,6 +516,7 @@ void mlx5_modify_lag(struct mlx5_lag *ldev, blocking_notifier_call_chain(&dev0->priv.lag_nh, MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE, ndev); + dev_put(ndev); } } From af7a35bf6c36a77624d3abe46b3830b7c2a5f20c Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 31 Oct 2024 15:36:51 +0200 Subject: [PATCH 29/51] RDMA/core: Implement RoCE GID port rescan and export delete function rdma_roce_rescan_port() scans all network devices in the system and adds the gids if relevant to the RoCE device port. When not in bonding mode it adds the GIDs of the netdevice in this port. When in bonding mode it adds the GIDs of both the port's netdevice and the bond master netdevice. Export roce_del_all_netdev_gids(), which removes all GIDs associated with a specific netdevice for a given port. Signed-off-by: Chiara Meiohas Link: https://patch.msgid.link/674d498da4637a1503ff1367e28bd09ff942fd5e.1730381292.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/roce_gid_mgmt.c | 30 +++++++++++++++++++++---- include/rdma/ib_verbs.h | 3 +++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index d5131b3ba8ab..a9f2c6b1b29e 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -515,6 +515,27 @@ void rdma_roce_rescan_device(struct ib_device *ib_dev) } EXPORT_SYMBOL(rdma_roce_rescan_device); +/** + * rdma_roce_rescan_port - Rescan all of the network devices in the system + * and add their gids if relevant to the port of the RoCE device. + * + * @ib_dev: IB device + * @port: Port number + */ +void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) +{ + struct net_device *ndev = NULL; + + if (rdma_protocol_roce(ib_dev, port)) { + ndev = ib_device_get_netdev(ib_dev, port); + if (!ndev) + return; + enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); + dev_put(ndev); + } +} +EXPORT_SYMBOL(rdma_roce_rescan_port); + static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, @@ -575,16 +596,17 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, } } -static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, - struct net_device *event_ndev) +void roce_del_all_netdev_gids(struct ib_device *ib_dev, + u32 port, struct net_device *ndev) { - ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); + ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } +EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { - handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); + handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9cb8b5fe7eee..67551133b522 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4734,6 +4734,9 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector) * @device: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ibdev); +void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port); +void roce_del_all_netdev_gids(struct ib_device *ib_dev, + u32 port, struct net_device *ndev); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); From 0bd2c61df95321e1ec123017cd8657360d15a24e Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 31 Oct 2024 15:36:52 +0200 Subject: [PATCH 30/51] RDMA/mlx5: Ensure active slave attachment to the bond IB device Fix a race condition when creating a lag bond in active backup mode where after the bond creation the backup slave was attached to the IB device, instead of the active slave. This caused stale entries in the GID table, as the gid updating mechanism relies on ib_device_get_netdev(), which would return the backup slave. Send an MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE event when activating the lag, additionally to when modifying the lag. This ensures that eventually the active netdevice is stored in the bond IB device. When handling this event remove the GIDs of the previously attached netdevice in this port and rescan the GIDs of the newly attached netdevice. This ensures that eventually the active slave netdevice is correctly stored in the IB device port. While there might be a brief moment where the backup slave GIDs appear in the GID table, it will eventually stabilize with the correct GIDs (of the bond and the active slave). Fixes: 8d159eb2117b ("RDMA/mlx5: Use IB set_netdev and get_netdev functions") Signed-off-by: Chiara Meiohas Link: https://patch.msgid.link/91fc2cb24f63add266a528c1c702668a80416d9f.1730381292.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 28 ++++++++++++------- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 11 ++++++++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5f7fe32b9051..5038c52b79aa 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3219,12 +3219,14 @@ static int lag_event(struct notifier_block *nb, unsigned long event, void *data) struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev, lag_events); struct mlx5_core_dev *mdev = dev->mdev; + struct ib_device *ibdev = &dev->ib_dev; + struct net_device *old_ndev = NULL; struct mlx5_ib_port *port; struct net_device *ndev; - int i, err; - int portnum; + u32 portnum = 0; + int ret = 0; + int i; - portnum = 0; switch (event) { case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE: ndev = data; @@ -3240,18 +3242,24 @@ static int lag_event(struct notifier_block *nb, unsigned long event, void *data) } } } - err = ib_device_set_netdev(&dev->ib_dev, ndev, - portnum + 1); - if (err) - return err; - /* Rescan gids after new netdev assignment */ - rdma_roce_rescan_device(&dev->ib_dev); + old_ndev = ib_device_get_netdev(ibdev, portnum + 1); + ret = ib_device_set_netdev(ibdev, ndev, portnum + 1); + if (ret) + goto out; + + if (old_ndev) + roce_del_all_netdev_gids(ibdev, portnum + 1, + old_ndev); + rdma_roce_rescan_port(ibdev, portnum + 1); } break; default: return NOTIFY_DONE; } - return NOTIFY_OK; + +out: + dev_put(old_ndev); + return notifier_from_errno(ret); } static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index d661267d98ff..7f68468c2e75 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -919,6 +919,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) { struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; struct lag_tracker tracker = { }; + struct net_device *ndev; bool do_bond, roce_lag; int err; int i; @@ -982,6 +983,16 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) return; } } + if (tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + ndev = mlx5_lag_active_backup_get_netdev(dev0); + /** Only sriov and roce lag should have tracker->TX_type + * set so no need to check the mode + */ + blocking_notifier_call_chain(&dev0->priv.lag_nh, + MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE, + ndev); + dev_put(ndev); + } } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) { mlx5_modify_lag(ldev, &tracker); } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) { From dc6be4418a1144cce422093cde0245c76cdcaff0 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 31 Oct 2024 13:22:51 +0200 Subject: [PATCH 31/51] RDMA/core: Add device ufile cleanup operation Add a driver operation to allow preemptive cleanup of ufile HW resources before the standard ufile cleanup flow begins. Thus, expediting the final cleanup phase which leads to fast teardown overall. This allows the use of driver specific clean up procedures to make the cleanup process more efficient. Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/cabe00d75132b5732cb515944e3c500a01fb0b4a.1730373303.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/rdma_core.c | 7 ++++++- include/rdma/ib_verbs.h | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index e029401b5680..de4ffc9eb37d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2759,6 +2759,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); + SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 29b1ab1d5f93..02ef09e77bf8 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -880,9 +880,14 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason) { + struct uverbs_attr_bundle attrs = { .ufile = ufile }; + struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *ib_dev = ucontext->device; struct ib_uobject *obj, *next_obj; int ret = -EINVAL; - struct uverbs_attr_bundle attrs = { .ufile = ufile }; + + if (ib_dev->ops.ufile_hw_cleanup) + ib_dev->ops.ufile_hw_cleanup(ufile); /* * This shouldn't run while executing other commands on this diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 67551133b522..3417636da960 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2675,6 +2675,12 @@ struct ib_device_ops { */ void (*del_sub_dev)(struct ib_device *sub_dev); + /** + * ufile_cleanup - Attempt to cleanup ubojects HW resources inside + * the ufile. + */ + void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile); + DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); From 27ed2f00807c2328c99751f9500ce6478f16cf7b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 31 Oct 2024 13:22:52 +0200 Subject: [PATCH 32/51] RDMA/core: Move ib_uverbs_file struct to uverbs_types.h In light of the previous commit, make the ib_uverbs_file accessible to drivers by moving its definition to uverbs_types.h, to allow drivers to freely access the struct argument and create a personalized cleanup flow. For the same reason expose uverbs_try_lock_object function to allow driver to safely access the uverbs objects. Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/29b718e0dca35daa5f496320a39284fc1f5a1722.1730373303.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/rdma_core.c | 5 +++-- drivers/infiniband/core/uverbs.h | 31 --------------------------- include/rdma/uverbs_types.h | 33 +++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 02ef09e77bf8..90c177edf9b0 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -58,8 +58,8 @@ void uverbs_uobject_put(struct ib_uobject *uobject) } EXPORT_SYMBOL(uverbs_uobject_put); -static int uverbs_try_lock_object(struct ib_uobject *uobj, - enum rdma_lookup_mode mode) +int uverbs_try_lock_object(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) { /* * When a shared access is required, we use a positive counter. Each @@ -84,6 +84,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, } return 0; } +EXPORT_SYMBOL(uverbs_try_lock_object); static void assert_uverbs_usecnt(struct ib_uobject *uobj, enum rdma_lookup_mode mode) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index dfd2e5a86e6f..797e2fcc8072 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -133,37 +133,6 @@ struct ib_uverbs_completion_event_file { struct ib_uverbs_event_queue ev_queue; }; -struct ib_uverbs_file { - struct kref ref; - struct ib_uverbs_device *device; - struct mutex ucontext_lock; - /* - * ucontext must be accessed via ib_uverbs_get_ucontext() or with - * ucontext_lock held - */ - struct ib_ucontext *ucontext; - struct ib_uverbs_async_event_file *default_async_file; - struct list_head list; - - /* - * To access the uobjects list hw_destroy_rwsem must be held for write - * OR hw_destroy_rwsem held for read AND uobjects_lock held. - * hw_destroy_rwsem should be called across any destruction of the HW - * object of an associated uobject. - */ - struct rw_semaphore hw_destroy_rwsem; - spinlock_t uobjects_lock; - struct list_head uobjects; - - struct mutex umap_lock; - struct list_head umaps; - struct page *disassociate_page; - - struct xarray idr; - - struct mutex disassociation_lock; -}; - struct ib_uverbs_event { union { struct ib_uverbs_async_event_desc async; diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index ccd11631c167..26ba919ac245 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -134,6 +134,8 @@ static inline void uverbs_uobject_get(struct ib_uobject *uobject) } void uverbs_uobject_put(struct ib_uobject *uobject); +int uverbs_try_lock_object(struct ib_uobject *uobj, enum rdma_lookup_mode mode); + struct uverbs_obj_fd_type { /* * In fd based objects, uverbs_obj_type_ops points to generic @@ -150,6 +152,37 @@ struct uverbs_obj_fd_type { int flags; }; +struct ib_uverbs_file { + struct kref ref; + struct ib_uverbs_device *device; + struct mutex ucontext_lock; + /* + * ucontext must be accessed via ib_uverbs_get_ucontext() or with + * ucontext_lock held + */ + struct ib_ucontext *ucontext; + struct ib_uverbs_async_event_file *default_async_file; + struct list_head list; + + /* + * To access the uobjects list hw_destroy_rwsem must be held for write + * OR hw_destroy_rwsem held for read AND uobjects_lock held. + * hw_destroy_rwsem should be called across any destruction of the HW + * object of an associated uobject. + */ + struct rw_semaphore hw_destroy_rwsem; + spinlock_t uobjects_lock; + struct list_head uobjects; + + struct mutex umap_lock; + struct list_head umaps; + struct page *disassociate_page; + + struct xarray idr; + + struct mutex disassociation_lock; +}; + extern const struct uverbs_obj_type_class uverbs_idr_class; extern const struct uverbs_obj_type_class uverbs_fd_class; int uverbs_uobject_fd_release(struct inode *inode, struct file *filp); From 7c891a4dbcc1f6c69297fcf89f1553e64d282643 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 31 Oct 2024 13:22:53 +0200 Subject: [PATCH 33/51] RDMA/mlx5: Add implementation for ufile_hw_cleanup device operation Implement the device API for ufile_hw_cleanup operation, which iterates over the ufile uobjects lists, and attempts to destroy DevX QPs, by issuing up to 8 commands in parallel. This function is responsible only for cleaning the FW resources of the QP, and doesn't necessarily cleanup all of its resources. Hence the normal serialized cleanup flow is still executed after it in __uverbs_cleanup_ufile() to cleanup the remaining resources and handle the cleanup of SW objects. In order to avoid double cleanup for the FW resources, new DevX flag was added DEVX_OBJ_FLAGS_HW_FREED, which marks the object's FW resources as already freed. Since QP destruction is the most time-consuming operation in FW, parallelizing it reduces the cleanup time of applications that use DevX QPs. Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/2f82675d0412542cba1c47a6b86f589521ae41e1.1730373303.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/devx.c | 93 ++++++++++++++++++++++++++++++- drivers/infiniband/hw/mlx5/devx.h | 4 ++ drivers/infiniband/hw/mlx5/main.c | 1 + 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 69999d8d24f3..4186884c66e1 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -27,6 +27,19 @@ enum devx_obj_flags { DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, DEVX_OBJ_FLAGS_DCT = 1 << 1, DEVX_OBJ_FLAGS_CQ = 1 << 2, + DEVX_OBJ_FLAGS_HW_FREED = 1 << 3, +}; + +#define MAX_ASYNC_CMDS 8 + +struct mlx5_async_cmd { + struct ib_uobject *uobject; + void *in; + int in_size; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + int err; + struct mlx5_async_work cb_work; + struct completion comp; }; struct devx_async_data { @@ -1405,7 +1418,9 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, */ mlx5r_deref_wait_odp_mkey(&obj->mkey); - if (obj->flags & DEVX_OBJ_FLAGS_DCT) + if (obj->flags & DEVX_OBJ_FLAGS_HW_FREED) + ret = 0; + else if (obj->flags & DEVX_OBJ_FLAGS_DCT) ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); @@ -2595,6 +2610,82 @@ void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) } } +static void devx_async_destroy_cb(int status, struct mlx5_async_work *context) +{ + struct mlx5_async_cmd *devx_out = container_of(context, + struct mlx5_async_cmd, cb_work); + struct devx_obj *obj = devx_out->uobject->object; + + if (!status) + obj->flags |= DEVX_OBJ_FLAGS_HW_FREED; + + complete(&devx_out->comp); +} + +static void devx_async_destroy(struct mlx5_ib_dev *dev, + struct mlx5_async_cmd *cmd) +{ + init_completion(&cmd->comp); + cmd->err = mlx5_cmd_exec_cb(&dev->async_ctx, cmd->in, cmd->in_size, + &cmd->out, sizeof(cmd->out), + devx_async_destroy_cb, &cmd->cb_work); +} + +static void devx_wait_async_destroy(struct mlx5_async_cmd *cmd) +{ + if (!cmd->err) + wait_for_completion(&cmd->comp); + atomic_set(&cmd->uobject->usecnt, 0); +} + +void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile) +{ + struct mlx5_async_cmd async_cmd[MAX_ASYNC_CMDS]; + struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *device = ucontext->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_uobject *uobject; + struct devx_obj *obj; + int head = 0; + int tail = 0; + + list_for_each_entry(uobject, &ufile->uobjects, list) { + WARN_ON(uverbs_try_lock_object(uobject, UVERBS_LOOKUP_WRITE)); + + /* + * Currently we only support QP destruction, if other objects + * are to be destroyed need to add type synchronization to the + * cleanup algorithm and handle pre/post FW cleanup for the + * new types if needed. + */ + if (uobj_get_object_id(uobject) != MLX5_IB_OBJECT_DEVX_OBJ || + (get_dec_obj_type(uobject->object, MLX5_EVENT_TYPE_MAX) != + MLX5_OBJ_TYPE_QP)) { + atomic_set(&uobject->usecnt, 0); + continue; + } + + obj = uobject->object; + + async_cmd[tail % MAX_ASYNC_CMDS].in = obj->dinbox; + async_cmd[tail % MAX_ASYNC_CMDS].in_size = obj->dinlen; + async_cmd[tail % MAX_ASYNC_CMDS].uobject = uobject; + + devx_async_destroy(dev, &async_cmd[tail % MAX_ASYNC_CMDS]); + tail++; + + if (tail - head == MAX_ASYNC_CMDS) { + devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]); + head++; + } + } + + while (head != tail) { + devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]); + head++; + } +} + static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h index ee2213275fd6..1344bf4c9d21 100644 --- a/drivers/infiniband/hw/mlx5/devx.h +++ b/drivers/infiniband/hw/mlx5/devx.h @@ -28,6 +28,7 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); int mlx5_ib_devx_init(struct mlx5_ib_dev *dev); void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev); +void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile); #else static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) { @@ -41,5 +42,8 @@ static inline int mlx5_ib_devx_init(struct mlx5_ib_dev *dev) static inline void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) { } +static inline void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile) +{ +} #endif #endif /* _MLX5_IB_DEVX_H */ diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5038c52b79aa..65da5df05d02 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4149,6 +4149,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .req_notify_cq = mlx5_ib_arm_cq, .rereg_user_mr = mlx5_ib_rereg_user_mr, .resize_cq = mlx5_ib_resize_cq, + .ufile_hw_cleanup = mlx5_ib_ufile_hw_cleanup, INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs), From 7566752e4d7d7fc0186531aa800068a7243f95c1 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 31 Oct 2024 11:31:14 +0200 Subject: [PATCH 34/51] RDMA/nldev: Add IB device and net device rename events Implement event sending for IB device rename and IB device port associated netdevice rename. In iproute2, rdma monitor displays the IB device name, port and the netdevice name when displaying event info. Since users can modiy these names, we track and notify on renaming events. Note: In order to receive netdevice rename events, drivers must use the ib_device_set_netdev() API when attaching net devices to IB devices. $ rdma monitor $ rmmod mlx5_ib [UNREGISTER] dev 1 rocep8s0f1 [UNREGISTER] dev 0 rocep8s0f0 $ modprobe mlx5_ib [REGISTER] dev 2 mlx5_0 [NETDEV_ATTACH] dev 2 mlx5_0 port 1 netdev 4 eth2 [REGISTER] dev 3 mlx5_1 [NETDEV_ATTACH] dev 3 mlx5_1 port 1 netdev 5 eth3 [RENAME] dev 2 rocep8s0f0 [RENAME] dev 3 rocep8s0f1 $ devlink dev eswitch set pci/0000:08:00.0 mode switchdev [UNREGISTER] dev 2 rocep8s0f0 [REGISTER] dev 4 mlx5_0 [NETDEV_ATTACH] dev 4 mlx5_0 port 30 netdev 4 eth2 [RENAME] dev 4 rdmap8s0f0 $ echo 4 > /sys/class/net/eth2/device/sriov_numvfs [NETDEV_ATTACH] dev 4 rdmap8s0f0 port 2 netdev 7 eth4 [NETDEV_ATTACH] dev 4 rdmap8s0f0 port 3 netdev 8 eth5 [NETDEV_ATTACH] dev 4 rdmap8s0f0 port 4 netdev 9 eth6 [NETDEV_ATTACH] dev 4 rdmap8s0f0 port 5 netdev 10 eth7 [REGISTER] dev 5 mlx5_0 [NETDEV_ATTACH] dev 5 mlx5_0 port 1 netdev 11 eth8 [REGISTER] dev 6 mlx5_1 [NETDEV_ATTACH] dev 6 mlx5_1 port 1 netdev 12 eth9 [RENAME] dev 5 rocep8s0f0v0 [RENAME] dev 6 rocep8s0f0v1 [REGISTER] dev 7 mlx5_0 [NETDEV_ATTACH] dev 7 mlx5_0 port 1 netdev 13 eth10 [RENAME] dev 7 rocep8s0f0v2 [REGISTER] dev 8 mlx5_0 [NETDEV_ATTACH] dev 8 mlx5_0 port 1 netdev 14 eth11 [RENAME] dev 8 rocep8s0f0v3 $ ip link set eth2 name myeth2 [NETDEV_RENAME] netdev 4 myeth2 $ ip link set eth1 name myeth1 ** no events received, because eth1 is not attached to an IB device ** Signed-off-by: Chiara Meiohas Link: https://patch.msgid.link/093c978ef2766fd3ab4ff8798eeb68f2f11582f6.1730367038.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 38 ++++++++++++++++++++++++++++++ drivers/infiniband/core/nldev.c | 40 ++++++++++++++++++++++++++++++-- include/uapi/rdma/rdma_netlink.h | 2 ++ 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index de4ffc9eb37d..ca9b956c034d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -437,6 +437,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) client->rename(ibdev, client_data); } up_read(&ibdev->client_data_rwsem); + rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); up_read(&devices_rwsem); return 0; } @@ -2853,6 +2854,40 @@ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { }, }; +static int ib_netdevice_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct net_device *ib_ndev; + struct ib_device *ibdev; + u32 port; + + switch (event) { + case NETDEV_CHANGENAME: + ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); + if (!ibdev) + return NOTIFY_DONE; + + rdma_for_each_port(ibdev, port) { + ib_ndev = ib_device_get_netdev(ibdev, port); + if (ndev == ib_ndev) + rdma_nl_notify_event(ibdev, port, + RDMA_NETDEV_RENAME_EVENT); + dev_put(ib_ndev); + } + ib_device_put(ibdev); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block nb_netdevice = { + .notifier_call = ib_netdevice_event, +}; + static int __init ib_core_init(void) { int ret = -ENOMEM; @@ -2924,6 +2959,8 @@ static int __init ib_core_init(void) goto err_parent; } + register_netdevice_notifier(&nb_netdevice); + return 0; err_parent: @@ -2953,6 +2990,7 @@ static int __init ib_core_init(void) static void __exit ib_core_cleanup(void) { + unregister_netdevice_notifier(&nb_netdevice); roce_gid_mgmt_cleanup(); rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 39f89a4b8649..0034c495cbbe 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -2729,6 +2729,25 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { }, }; +static int fill_mon_netdev_rename(struct sk_buff *msg, + struct ib_device *device, u32 port, + const struct net *net) +{ + struct net_device *netdev = ib_device_get_netdev(device, port); + int ret = 0; + + if (!netdev || !net_eq(dev_net(netdev), net)) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); +out: + dev_put(netdev); + return ret; +} + static int fill_mon_netdev_association(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) @@ -2793,6 +2812,18 @@ static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, "Failed to send RDMA monitor netdev detach event: port %d\n", port_num); break; + case RDMA_RENAME_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor rename device event\n"); + break; + + case RDMA_NETDEV_RENAME_EVENT: + netdev = ib_device_get_netdev(device, port_num); + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev rename event: port %d netdev %d\n", + port_num, netdev->ifindex); + dev_put(netdev); + break; default: break; } @@ -2820,14 +2851,19 @@ int rdma_nl_notify_event(struct ib_device *device, u32 port_num, switch (type) { case RDMA_REGISTER_EVENT: case RDMA_UNREGISTER_EVENT: + case RDMA_RENAME_EVENT: ret = fill_nldev_handle(skb, device); if (ret) goto err_free; break; case RDMA_NETDEV_ATTACH_EVENT: case RDMA_NETDEV_DETACH_EVENT: - ret = fill_mon_netdev_association(skb, device, - port_num, net); + ret = fill_mon_netdev_association(skb, device, port_num, net); + if (ret) + goto err_free; + break; + case RDMA_NETDEV_RENAME_EVENT: + ret = fill_mon_netdev_rename(skb, device, port_num, net); if (ret) goto err_free; break; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 39be09c0ffbb..9f9cf20c1cd8 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -638,6 +638,8 @@ enum rdma_nl_notify_event_type { RDMA_UNREGISTER_EVENT, RDMA_NETDEV_ATTACH_EVENT, RDMA_NETDEV_DETACH_EVENT, + RDMA_RENAME_EVENT, + RDMA_NETDEV_RENAME_EVENT, }; #endif /* _UAPI_RDMA_NETLINK_H */ From 5dbcb1c1900f45182b5651c89257c272f1f3ead7 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Fri, 8 Nov 2024 15:57:42 +0800 Subject: [PATCH 35/51] RDMA/hns: Fix out-of-order issue of requester when setting FENCE The FENCE indicator in hns WQE doesn't ensure that response data from a previous Read/Atomic operation has been written to the requester's memory before the subsequent Send/Write operation is processed. This may result in the subsequent Send/Write operation accessing the original data in memory instead of the expected response data. Unlike FENCE, the SO (Strong Order) indicator blocks the subsequent operation until the previous response data is written to memory and a bresp is returned. Set the SO indicator instead of FENCE to maintain strict order. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241108075743.2652258-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index d1c075fb0ad8..707e96ce222c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -575,7 +575,7 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, if (WARN_ON(ret)) return ret; - hr_reg_write(rc_sq_wqe, RC_SEND_WQE_FENCE, + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SO, (wr->send_flags & IB_SEND_FENCE) ? 1 : 0); hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SE, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 3b3c6259ace0..dedb1853e193 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -924,6 +924,7 @@ struct hns_roce_v2_rc_send_wqe { #define RC_SEND_WQE_OWNER RC_SEND_WQE_FIELD_LOC(7, 7) #define RC_SEND_WQE_CQE RC_SEND_WQE_FIELD_LOC(8, 8) #define RC_SEND_WQE_FENCE RC_SEND_WQE_FIELD_LOC(9, 9) +#define RC_SEND_WQE_SO RC_SEND_WQE_FIELD_LOC(10, 10) #define RC_SEND_WQE_SE RC_SEND_WQE_FIELD_LOC(11, 11) #define RC_SEND_WQE_INLINE RC_SEND_WQE_FIELD_LOC(12, 12) #define RC_SEND_WQE_WQE_INDEX RC_SEND_WQE_FIELD_LOC(30, 15) From 6b526d17eed850352d880b93b9bf20b93006bd92 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Fri, 8 Nov 2024 15:57:43 +0800 Subject: [PATCH 36/51] RDMA/hns: Fix NULL pointer derefernce in hns_roce_map_mr_sg() ib_map_mr_sg() allows ULPs to specify NULL as the sg_offset argument. The driver needs to check whether it is a NULL pointer before dereferencing it. Fixes: d387d4b54eb8 ("RDMA/hns: Fix missing pagesize and alignment check in FRMR") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241108075743.2652258-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_mr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index b3f4327d0e64..bf30b3a65a9b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -435,15 +435,16 @@ static int hns_roce_set_page(struct ib_mr *ibmr, u64 addr) } int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, - unsigned int *sg_offset) + unsigned int *sg_offset_p) { + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_mr *mr = to_hr_mr(ibmr); struct hns_roce_mtr *mtr = &mr->pbl_mtr; int ret, sg_num = 0; - if (!IS_ALIGNED(*sg_offset, HNS_ROCE_FRMR_ALIGN_SIZE) || + if (!IS_ALIGNED(sg_offset, HNS_ROCE_FRMR_ALIGN_SIZE) || ibmr->page_size < HNS_HW_PAGE_SIZE || ibmr->page_size > HNS_HW_MAX_PAGE_SIZE) return sg_num; @@ -454,7 +455,7 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, if (!mr->page_list) return sg_num; - sg_num = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page); + sg_num = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset_p, hns_roce_set_page); if (sg_num < 1) { ibdev_err(ibdev, "failed to store sg pages %u %u, cnt = %d.\n", mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, sg_num); From 53371c5c218f9fd5ec18843762a65c686040c574 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Wed, 6 Nov 2024 00:44:34 -0800 Subject: [PATCH 37/51] bnxt_en: Add support for RoCE sriov configuration During driver load, PF RDMA driver provisions resources to the RDMA VFs. This logic takes into consideration of the total number of VFs supported on the PF while allocating resources. Firmware now advertises a capability where NIC driver can allocate resources for RDMA VFs when the user actually creates a VF. So this resource distribution can be based on the number of active VFs. This patch adds the support to check for the firmware capability and follow the new RDMA VF resource allocation strategy. The current logic in the RDMA driver will be removed for the newer Firmware versions in a subsequent patch in this series. Signed-off-by: Vikas Gupta Reviewed-by: Selvin Xavier Reviewed-by: Pavan Chebbi Reviewed-by: Michael Chan Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1730882676-24434-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 +++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 6 +++ .../net/ethernet/broadcom/bnxt/bnxt_sriov.c | 53 +++++++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 6e422e24750a..70230c53d7b8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8151,6 +8151,9 @@ static int bnxt_hwrm_func_qcfg(struct bnxt *bp) if (flags & FUNC_QCFG_RESP_FLAGS_RING_MONITOR_ENABLED) bp->fw_cap |= BNXT_FW_CAP_RING_MONITOR; + if (flags & FUNC_QCFG_RESP_FLAGS_ENABLE_RDMA_SRIOV) + bp->fw_cap |= BNXT_FW_CAP_ENABLE_RDMA_SRIOV; + switch (resp->port_partition_type) { case FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_0: case FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_5: @@ -9177,6 +9180,9 @@ static int __bnxt_hwrm_func_qcaps(struct bnxt *bp) bp->flags |= BNXT_FLAG_UDP_GSO_CAP; if (flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_TX_PKT_TS_CMPL_SUPPORTED) bp->fw_cap |= BNXT_FW_CAP_TX_TS_CMP; + if (BNXT_PF(bp) && + (flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_ROCE_VF_RESOURCE_MGMT_SUPPORTED)) + bp->fw_cap |= BNXT_FW_CAP_ROCE_VF_RESC_MGMT_SUPPORTED; bp->tx_push_thresh = 0; if ((flags & FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED) && diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 69231e85140b..2da6c7ba5238 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2406,6 +2406,8 @@ struct bnxt { #define BNXT_FW_CAP_DCBX_AGENT BIT_ULL(2) #define BNXT_FW_CAP_NEW_RM BIT_ULL(3) #define BNXT_FW_CAP_IF_CHANGE BIT_ULL(4) + #define BNXT_FW_CAP_ENABLE_RDMA_SRIOV BIT_ULL(5) + #define BNXT_FW_CAP_ROCE_VF_RESC_MGMT_SUPPORTED BIT_ULL(6) #define BNXT_FW_CAP_KONG_MB_CHNL BIT_ULL(7) #define BNXT_FW_CAP_OVS_64BIT_HANDLE BIT_ULL(10) #define BNXT_FW_CAP_TRUSTED_VF BIT_ULL(11) @@ -2452,6 +2454,10 @@ struct bnxt { #define BNXT_SUPPORTS_QUEUE_API(bp) \ (BNXT_PF(bp) && BNXT_SUPPORTS_NTUPLE_VNIC(bp) && \ ((bp)->fw_cap & BNXT_FW_CAP_VNIC_RE_FLUSH)) +#define BNXT_RDMA_SRIOV_EN(bp) \ + ((bp)->fw_cap & BNXT_FW_CAP_ENABLE_RDMA_SRIOV) +#define BNXT_ROCE_VF_RESC_CAP(bp) \ + ((bp)->fw_cap & BNXT_FW_CAP_ROCE_VF_RESC_MGMT_SUPPORTED) u32 hwrm_spec_code; u16 hwrm_cmd_seq; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 7bb8a5d74430..12b6ed51fd88 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -520,6 +520,56 @@ static int __bnxt_set_vf_params(struct bnxt *bp, int vf_id) return hwrm_req_send(bp, req); } +static void bnxt_hwrm_roce_sriov_cfg(struct bnxt *bp, int num_vfs) +{ + struct hwrm_func_qcaps_output *resp; + struct hwrm_func_cfg_input *cfg_req; + struct hwrm_func_qcaps_input *req; + int rc; + + rc = hwrm_req_init(bp, req, HWRM_FUNC_QCAPS); + if (rc) + return; + + req->fid = cpu_to_le16(0xffff); + resp = hwrm_req_hold(bp, req); + rc = hwrm_req_send(bp, req); + if (rc) + goto err; + + rc = hwrm_req_init(bp, cfg_req, HWRM_FUNC_CFG); + if (rc) + goto err; + + cfg_req->fid = cpu_to_le16(0xffff); + cfg_req->enables2 = + cpu_to_le32(FUNC_CFG_REQ_ENABLES2_ROCE_MAX_AV_PER_VF | + FUNC_CFG_REQ_ENABLES2_ROCE_MAX_CQ_PER_VF | + FUNC_CFG_REQ_ENABLES2_ROCE_MAX_MRW_PER_VF | + FUNC_CFG_REQ_ENABLES2_ROCE_MAX_QP_PER_VF | + FUNC_CFG_REQ_ENABLES2_ROCE_MAX_SRQ_PER_VF | + FUNC_CFG_REQ_ENABLES2_ROCE_MAX_GID_PER_VF); + cfg_req->roce_max_av_per_vf = + cpu_to_le32(le32_to_cpu(resp->roce_vf_max_av) / num_vfs); + cfg_req->roce_max_cq_per_vf = + cpu_to_le32(le32_to_cpu(resp->roce_vf_max_cq) / num_vfs); + cfg_req->roce_max_mrw_per_vf = + cpu_to_le32(le32_to_cpu(resp->roce_vf_max_mrw) / num_vfs); + cfg_req->roce_max_qp_per_vf = + cpu_to_le32(le32_to_cpu(resp->roce_vf_max_qp) / num_vfs); + cfg_req->roce_max_srq_per_vf = + cpu_to_le32(le32_to_cpu(resp->roce_vf_max_srq) / num_vfs); + cfg_req->roce_max_gid_per_vf = + cpu_to_le32(le32_to_cpu(resp->roce_vf_max_gid) / num_vfs); + + rc = hwrm_req_send(bp, cfg_req); + +err: + hwrm_req_drop(bp, req); + if (rc) + netdev_err(bp->dev, "RoCE sriov configuration failed\n"); +} + /* Only called by PF to reserve resources for VFs, returns actual number of * VFs configured, or < 0 on error. */ @@ -759,6 +809,9 @@ int bnxt_cfg_hw_sriov(struct bnxt *bp, int *num_vfs, bool reset) *num_vfs = rc; } + if (BNXT_RDMA_SRIOV_EN(bp) && BNXT_ROCE_VF_RESC_CAP(bp)) + bnxt_hwrm_roce_sriov_cfg(bp, *num_vfs); + return 0; } From 304cc83807da5fd3044b0f20ed415027e40cd6e7 Mon Sep 17 00:00:00 2001 From: Bhargava Chenna Marreddy Date: Wed, 6 Nov 2024 00:44:35 -0800 Subject: [PATCH 38/51] RDMA/bnxt_re: Enhance RoCE SRIOV resource configuration design Refine RoCE SRIOV resource configuration design, using the INITIALIZE_FW's flag as an indication for the new design to the firmware. RoCE driver does not have to provision resources to VF when firmware advertises support for RoCE resource management by NIC driver. Signed-off-by: Bhargava Chenna Marreddy Signed-off-by: Kalesh AP Reviewed-by: Vikas Gupta Reviewed-by: Selvin Xavier CC: Michael Chan Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1730882676-24434-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 13 ++++++++----- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 2 ++ drivers/infiniband/hw/bnxt_re/qplib_res.h | 3 +++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 2 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 1 + 6 files changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 4127227a9447..dd528dd63d53 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -184,6 +184,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) rdev->rcfw.res = &rdev->qplib_res; rdev->qplib_res.dattr = &rdev->dev_attr; rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); + rdev->qplib_res.en_dev = en_dev; bnxt_re_set_drv_mode(rdev); @@ -285,6 +286,10 @@ static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) { + /* + * Use the total VF count since the actual VF count may not be + * available at this point. + */ rdev->num_vfs = pci_sriov_get_totalvfs(rdev->en_dev->pdev); if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { bnxt_re_set_resource_limits(rdev); @@ -2056,11 +2061,9 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); - /* - * Use the total VF count since the actual VF count may not be - * available at this point. - */ - bnxt_re_vf_res_config(rdev); + + if (!(rdev->qplib_res.en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT)) + bnxt_re_vf_res_config(rdev); } hash_init(rdev->cq_hash); if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index f5713e3c39fb..005079b037f4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -910,6 +910,8 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED; if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2)) flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED; + if (rcfw->res->en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT) + flags |= CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT; req.flags |= cpu_to_le16(flags); req.stat_ctx_id = cpu_to_le32(ctx->stats.fw_id); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 115910c7e56d..21fb148713a6 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -39,6 +39,8 @@ #ifndef __BNXT_QPLIB_RES_H__ #define __BNXT_QPLIB_RES_H__ +#include "bnxt_ulp.h" + extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; #define CHIP_NUM_57508 0x1750 @@ -302,6 +304,7 @@ struct bnxt_qplib_res { struct bnxt_qplib_chip_ctx *cctx; struct bnxt_qplib_dev_attr *dattr; struct net_device *netdev; + struct bnxt_en_dev *en_dev; struct bnxt_qplib_rcfw *rcfw; struct bnxt_qplib_pd_tbl pd_tbl; /* To protect the pd table bit map */ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index d9c53731871e..a98fc9c2313e 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -217,6 +217,7 @@ struct cmdq_initialize_fw { #define CMDQ_INITIALIZE_FW_FLAGS_MRAV_RESERVATION_SPLIT 0x1UL #define CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED 0x2UL #define CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED 0x8UL + #define CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT 0x10UL __le16 cookie; u8 resp_size; u8 reserved8; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index fdd6356f21ef..b771c84cdd89 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -414,6 +414,8 @@ static void bnxt_set_edev_info(struct bnxt_en_dev *edev, struct bnxt *bp) edev->flags |= BNXT_EN_FLAG_ROCEV2_CAP; if (bp->flags & BNXT_FLAG_VF) edev->flags |= BNXT_EN_FLAG_VF; + if (BNXT_ROCE_VF_RESC_CAP(bp)) + edev->flags |= BNXT_EN_FLAG_ROCE_VF_RES_MGMT; edev->chip_num = bp->chip_num; edev->hw_ring_stats_size = bp->hw_ring_stats_size; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h index 4f4914f5c84c..5d6aac60f236 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -64,6 +64,7 @@ struct bnxt_en_dev { #define BNXT_EN_FLAG_ULP_STOPPED 0x8 #define BNXT_EN_FLAG_VF 0x10 #define BNXT_EN_VF(edev) ((edev)->flags & BNXT_EN_FLAG_VF) + #define BNXT_EN_FLAG_ROCE_VF_RES_MGMT 0x20 struct bnxt_ulp *ulp_tbl; int l2_db_size; /* Doorbell BAR size in From cdb21c12adcb9eaf97ac085fd0d1382f9830224b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 6 Nov 2024 00:44:36 -0800 Subject: [PATCH 39/51] RDMA/bnxt_re: Add set_func_resources support for P5/P7 adapters Enable set_func_resources for P5 and P7 adapters to handle VF resource distribution. Remove setting max resources per VF during PF initialization. This change is required for firmwares which does not support RoCE VF resource management by NIC driver. The code is same for all adapters now. Reviewed-by: Stephen Shi Reviewed-by: Rukhsana Ansari Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1730882676-24434-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 11 ++++++----- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 11 +---------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index dd528dd63d53..cb61941672f6 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -291,11 +291,12 @@ static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) * available at this point. */ rdev->num_vfs = pci_sriov_get_totalvfs(rdev->en_dev->pdev); - if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { - bnxt_re_set_resource_limits(rdev); - bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, - &rdev->qplib_ctx); - } + if (!rdev->num_vfs) + return; + + bnxt_re_set_resource_limits(rdev); + bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, + &rdev->qplib_ctx); } static void bnxt_re_shutdown(struct auxiliary_device *adev) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 005079b037f4..70729916468d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -851,10 +851,8 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, * shall setup this area for VF. Skipping the * HW programming */ - if (is_virtfn) + if (is_virtfn || bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) goto skip_ctx_setup; - if (bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) - goto config_vf_res; lvl = ctx->qpc_tbl.level; pgsz = bnxt_qplib_base_pg_size(&ctx->qpc_tbl); @@ -898,13 +896,6 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, req.number_of_srq = cpu_to_le32(ctx->srqc_tbl.max_elements); req.number_of_cq = cpu_to_le32(ctx->cq_tbl.max_elements); -config_vf_res: - req.max_qp_per_vf = cpu_to_le32(ctx->vf_res.max_qp_per_vf); - req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf); - req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf); - req.max_cq_per_vf = cpu_to_le32(ctx->vf_res.max_cq_per_vf); - req.max_gid_per_vf = cpu_to_le32(ctx->vf_res.max_gid_per_vf); - skip_ctx_setup: if (BNXT_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags)) flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED; From faa62440a5772b40bb7d78bf9e29556a82ecf153 Mon Sep 17 00:00:00 2001 From: Feng Fang Date: Tue, 12 Nov 2024 13:55:53 +0800 Subject: [PATCH 40/51] RDMA/hns: Fix different dgids mapping to the same dip_idx DIP algorithm requires a one-to-one mapping between dgid and dip_idx. Currently a queue 'spare_idx' is used to store QPN of QPs that use DIP algorithm. For a new dgid, use a QPN from spare_idx as dip_idx. This method lacks a mechanism for deduplicating QPN, which may result in different dgids sharing the same dip_idx and break the one-to-one mapping requirement. This patch replaces spare_idx with xarray and introduces a refcnt of a dip_idx to indicate the number of QPs that using this dip_idx. The state machine for dip_idx management is implemented as: * The entry at an index in xarray is empty -- This indicates that the corresponding dip_idx hasn't been created. * The entry at an index in xarray is not empty but with 0 refcnt -- This indicates that the corresponding dip_idx has been created but not used as dip_idx yet. * The entry at an index in xarray is not empty and with non-0 refcnt -- This indicates that the corresponding dip_idx is being used by refcnt number of DIP QPs. Fixes: eb653eda1e91 ("RDMA/hns: Bugfix for incorrect association between dip_idx and dgid") Fixes: f91696f2f053 ("RDMA/hns: Support congestion control type selection according to the FW") Signed-off-by: Feng Fang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241112055553.3681129-1-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_device.h | 11 +-- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 96 +++++++++++++++------ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +- drivers/infiniband/hw/hns/hns_roce_main.c | 2 - drivers/infiniband/hw/hns/hns_roce_qp.c | 8 +- 5 files changed, 75 insertions(+), 44 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 9b51d5a1533f..560a1d9de408 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -489,12 +489,6 @@ struct hns_roce_bank { u32 next; /* Next ID to allocate. */ }; -struct hns_roce_idx_table { - u32 *spare_idx; - u32 head; - u32 tail; -}; - struct hns_roce_qp_table { struct hns_roce_hem_table qp_table; struct hns_roce_hem_table irrl_table; @@ -503,7 +497,7 @@ struct hns_roce_qp_table { struct mutex scc_mutex; struct hns_roce_bank bank[HNS_ROCE_QP_BANK_NUM]; struct mutex bank_mutex; - struct hns_roce_idx_table idx_table; + struct xarray dip_xa; }; struct hns_roce_cq_table { @@ -658,6 +652,7 @@ struct hns_roce_qp { u8 tc_mode; u8 priority; spinlock_t flush_lock; + struct hns_roce_dip *dip; }; struct hns_roce_ib_iboe { @@ -984,8 +979,6 @@ struct hns_roce_dev { enum hns_roce_device_state state; struct list_head qp_list; /* list of all qps on this dev */ spinlock_t qp_list_lock; /* protect qp_list */ - struct list_head dip_list; /* list of all dest ips on this dev */ - spinlock_t dip_list_lock; /* protect dip_list */ struct list_head pgdir_list; struct mutex pgdir_mutex; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 707e96ce222c..697b17cca02e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2553,20 +2553,19 @@ static void hns_roce_free_link_table(struct hns_roce_dev *hr_dev) free_link_table_buf(hr_dev, &priv->ext_llm); } -static void free_dip_list(struct hns_roce_dev *hr_dev) +static void free_dip_entry(struct hns_roce_dev *hr_dev) { struct hns_roce_dip *hr_dip; - struct hns_roce_dip *tmp; - unsigned long flags; + unsigned long idx; - spin_lock_irqsave(&hr_dev->dip_list_lock, flags); + xa_lock(&hr_dev->qp_table.dip_xa); - list_for_each_entry_safe(hr_dip, tmp, &hr_dev->dip_list, node) { - list_del(&hr_dip->node); + xa_for_each(&hr_dev->qp_table.dip_xa, idx, hr_dip) { + __xa_erase(&hr_dev->qp_table.dip_xa, hr_dip->dip_idx); kfree(hr_dip); } - spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); + xa_unlock(&hr_dev->qp_table.dip_xa); } static struct ib_pd *free_mr_init_pd(struct hns_roce_dev *hr_dev) @@ -2974,7 +2973,7 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) hns_roce_free_link_table(hr_dev); if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP09) - free_dip_list(hr_dev); + free_dip_entry(hr_dev); } static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, @@ -4694,26 +4693,49 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, int attr_mask, return 0; } +static int alloc_dip_entry(struct xarray *dip_xa, u32 qpn) +{ + struct hns_roce_dip *hr_dip; + int ret; + + hr_dip = xa_load(dip_xa, qpn); + if (hr_dip) + return 0; + + hr_dip = kzalloc(sizeof(*hr_dip), GFP_KERNEL); + if (!hr_dip) + return -ENOMEM; + + ret = xa_err(xa_store(dip_xa, qpn, hr_dip, GFP_KERNEL)); + if (ret) + kfree(hr_dip); + + return ret; +} + static int get_dip_ctx_idx(struct ib_qp *ibqp, const struct ib_qp_attr *attr, u32 *dip_idx) { const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); - u32 *spare_idx = hr_dev->qp_table.idx_table.spare_idx; - u32 *head = &hr_dev->qp_table.idx_table.head; - u32 *tail = &hr_dev->qp_table.idx_table.tail; + struct xarray *dip_xa = &hr_dev->qp_table.dip_xa; + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct hns_roce_dip *hr_dip; - unsigned long flags; + unsigned long idx; int ret = 0; - spin_lock_irqsave(&hr_dev->dip_list_lock, flags); + ret = alloc_dip_entry(dip_xa, ibqp->qp_num); + if (ret) + return ret; - spare_idx[*tail] = ibqp->qp_num; - *tail = (*tail == hr_dev->caps.num_qps - 1) ? 0 : (*tail + 1); + xa_lock(dip_xa); - list_for_each_entry(hr_dip, &hr_dev->dip_list, node) { - if (!memcmp(grh->dgid.raw, hr_dip->dgid, GID_LEN_V2)) { + xa_for_each(dip_xa, idx, hr_dip) { + if (hr_dip->qp_cnt && + !memcmp(grh->dgid.raw, hr_dip->dgid, GID_LEN_V2)) { *dip_idx = hr_dip->dip_idx; + hr_dip->qp_cnt++; + hr_qp->dip = hr_dip; goto out; } } @@ -4721,19 +4743,24 @@ static int get_dip_ctx_idx(struct ib_qp *ibqp, const struct ib_qp_attr *attr, /* If no dgid is found, a new dip and a mapping between dgid and * dip_idx will be created. */ - hr_dip = kzalloc(sizeof(*hr_dip), GFP_ATOMIC); - if (!hr_dip) { - ret = -ENOMEM; - goto out; + xa_for_each(dip_xa, idx, hr_dip) { + if (hr_dip->qp_cnt) + continue; + + *dip_idx = idx; + memcpy(hr_dip->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); + hr_dip->dip_idx = idx; + hr_dip->qp_cnt++; + hr_qp->dip = hr_dip; + break; } - memcpy(hr_dip->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); - hr_dip->dip_idx = *dip_idx = spare_idx[*head]; - *head = (*head == hr_dev->caps.num_qps - 1) ? 0 : (*head + 1); - list_add_tail(&hr_dip->node, &hr_dev->dip_list); + /* This should never happen. */ + if (WARN_ON_ONCE(!hr_qp->dip)) + ret = -ENOSPC; out: - spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); + xa_unlock(dip_xa); return ret; } @@ -5587,6 +5614,20 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, return ret; } +static void put_dip_ctx_idx(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp) +{ + struct hns_roce_dip *hr_dip = hr_qp->dip; + + xa_lock(&hr_dev->qp_table.dip_xa); + + hr_dip->qp_cnt--; + if (!hr_dip->qp_cnt) + memset(hr_dip->dgid, 0, GID_LEN_V2); + + xa_unlock(&hr_dev->qp_table.dip_xa); +} + int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); @@ -5600,6 +5641,9 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) spin_unlock_irqrestore(&hr_qp->flush_lock, flags); flush_work(&hr_qp->flush_work.work); + if (hr_qp->cong_type == CONG_TYPE_DIP) + put_dip_ctx_idx(hr_dev, hr_qp); + ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata); if (ret) ibdev_err_ratelimited(&hr_dev->ib_dev, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index dedb1853e193..cbdbc9edbce6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1348,7 +1348,7 @@ struct hns_roce_v2_priv { struct hns_roce_dip { u8 dgid[GID_LEN_V2]; u32 dip_idx; - struct list_head node; /* all dips are on a list */ + u32 qp_cnt; }; struct fmea_ram_ecc { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 49315f39361d..ae24c81c9812 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -1135,8 +1135,6 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) INIT_LIST_HEAD(&hr_dev->qp_list); spin_lock_init(&hr_dev->qp_list_lock); - INIT_LIST_HEAD(&hr_dev->dip_list); - spin_lock_init(&hr_dev->dip_list_lock); ret = hns_roce_register_device(hr_dev); if (ret) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 2ad03ecdbf8e..9e2e76c59406 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1573,14 +1573,10 @@ int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev) unsigned int reserved_from_bot; unsigned int i; - qp_table->idx_table.spare_idx = kcalloc(hr_dev->caps.num_qps, - sizeof(u32), GFP_KERNEL); - if (!qp_table->idx_table.spare_idx) - return -ENOMEM; - mutex_init(&qp_table->scc_mutex); mutex_init(&qp_table->bank_mutex); xa_init(&hr_dev->qp_table_xa); + xa_init(&qp_table->dip_xa); reserved_from_bot = hr_dev->caps.reserved_qps; @@ -1605,7 +1601,7 @@ void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) ida_destroy(&hr_dev->qp_table.bank[i].ida); + xa_destroy(&hr_dev->qp_table.dip_xa); mutex_destroy(&hr_dev->qp_table.bank_mutex); mutex_destroy(&hr_dev->qp_table.scc_mutex); - kfree(hr_dev->qp_table.idx_table.spare_idx); } From 65ecee132774e0f15cd76a766eb39ec21118bffc Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 14 Nov 2024 01:49:05 -0800 Subject: [PATCH 41/51] RDMA/bnxt_re: Fail probe early when not enough MSI-x vectors are reserved L2 driver allocates and populates the MSI-x vector details for RoCE in the en_dev structure. RoCE driver requires minimum 2 MSIx vectors. Hence during probe, driver has to check and bail out if there are not enough MSI-x vectors reserved for it before proceeding further initialization. Reviewed-by: Andy Gospodarek Reviewed-by: Ajit Khaparde Reviewed-by: Hongguang Gao Reviewed-by: Bhargava Chenna Marreddy Reviewed-by: Kashyap Desai Reviewed-by: Chandramohan Akula Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1731577748-1804-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 ++ drivers/infiniband/hw/bnxt_re/main.c | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index d1b7c2029bdc..7abc37bed6d5 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -154,6 +154,8 @@ struct bnxt_re_pacing { #define BNXT_RE_GRC_FIFO_REG_BASE 0x2000 +#define BNXT_RE_MIN_MSIX 2 + #define MAX_CQ_HASH_BITS (16) #define MAX_SRQ_HASH_BITS (16) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index cb61941672f6..c262a16e4219 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1936,6 +1936,18 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) } set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + if (rdev->en_dev->ulp_tbl->msix_requested < BNXT_RE_MIN_MSIX) { + ibdev_err(&rdev->ibdev, + "RoCE requires minimum 2 MSI-X vectors, but only %d reserved\n", + rdev->en_dev->ulp_tbl->msix_requested); + bnxt_unregister_dev(rdev->en_dev); + clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return -EINVAL; + } + ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", + rdev->en_dev->ulp_tbl->msix_requested); + rdev->num_msix = rdev->en_dev->ulp_tbl->msix_requested; + rc = bnxt_re_setup_chip_ctx(rdev); if (rc) { bnxt_unregister_dev(rdev->en_dev); @@ -1947,16 +1959,6 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) /* Check whether VF or PF */ bnxt_re_get_sriov_func_type(rdev); - if (!rdev->en_dev->ulp_tbl->msix_requested) { - ibdev_err(&rdev->ibdev, - "Failed to get MSI-X vectors: %#x\n", rc); - rc = -EINVAL; - goto fail; - } - ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", - rdev->en_dev->ulp_tbl->msix_requested); - rdev->num_msix = rdev->en_dev->ulp_tbl->msix_requested; - bnxt_re_query_hwrm_intf_version(rdev); /* Establish RCFW Communication Channel to initialize the context From 30b871338c3ebab4c5efb74f6b23b59f1ac4ca1f Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 14 Nov 2024 01:49:06 -0800 Subject: [PATCH 42/51] RDMA/bnxt_re: Refactor NQ allocation Move NQ related data structures from rdev to a new structure named "struct bnxt_re_nq_record" by keeping a pointer to in the rdev structure. Allocate the memory for it dynamically. This change is needed for subsequent patches in the series. Also, removed the nq_task variable from rdev structure as it is redundant and no longer used. This change would help to reduce the size of the driver private structure as well. Reviewed-by: Chandramohan Akula Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1731577748-1804-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 13 +++-- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 6 +- drivers/infiniband/hw/bnxt_re/main.c | 74 ++++++++++++++++-------- 3 files changed, 60 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 7abc37bed6d5..d1c839efab05 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -155,6 +155,11 @@ struct bnxt_re_pacing { #define BNXT_RE_GRC_FIFO_REG_BASE 0x2000 #define BNXT_RE_MIN_MSIX 2 +#define BNXT_RE_MAX_MSIX BNXT_MAX_ROCE_MSIX +struct bnxt_re_nq_record { + struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; + int num_msix; +}; #define MAX_CQ_HASH_BITS (16) #define MAX_SRQ_HASH_BITS (16) @@ -183,21 +188,17 @@ struct bnxt_re_dev { unsigned int version, major, minor; struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; - int num_msix; int id; struct delayed_work worker; u8 cur_prio_map; - /* FP Notification Queue (CQ & SRQ) */ - struct tasklet_struct nq_task; - /* RCFW Channel */ struct bnxt_qplib_rcfw rcfw; - /* NQ */ - struct bnxt_qplib_nq nq[BNXT_MAX_ROCE_MSIX]; + /* NQ record */ + struct bnxt_re_nq_record *nqr; /* Device Resources */ struct bnxt_qplib_dev_attr dev_attr; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 9a188ccd4ead..a9c32c031705 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1872,8 +1872,8 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, srq->qplib_srq.wqe_size = bnxt_re_get_rwqe_size(dev_attr->max_srq_sges); srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit; srq->srq_limit = srq_init_attr->attr.srq_limit; - srq->qplib_srq.eventq_hw_ring_id = rdev->nq[0].ring_id; - nq = &rdev->nq[0]; + srq->qplib_srq.eventq_hw_ring_id = rdev->nqr->nq[0].ring_id; + nq = &rdev->nqr->nq[0]; if (udata) { rc = bnxt_re_init_user_srq(rdev, pd, srq, udata); @@ -3122,7 +3122,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, * used for getting the NQ index. */ nq_alloc_cnt = atomic_inc_return(&rdev->nq_alloc_cnt); - nq = &rdev->nq[nq_alloc_cnt % (rdev->num_msix - 1)]; + nq = &rdev->nqr->nq[nq_alloc_cnt % (rdev->nqr->num_msix - 1)]; cq->qplib_cq.max_wqe = entries; cq->qplib_cq.cnq_hw_ring_id = nq->ring_id; cq->qplib_cq.nq = nq; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index c262a16e4219..9669defcebf6 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -326,8 +326,8 @@ static void bnxt_re_stop_irq(void *handle) rdev = en_info->rdev; rcfw = &rdev->rcfw; - for (indx = BNXT_RE_NQ_IDX; indx < rdev->num_msix; indx++) { - nq = &rdev->nq[indx - 1]; + for (indx = BNXT_RE_NQ_IDX; indx < rdev->nqr->num_msix; indx++) { + nq = &rdev->nqr->nq[indx - 1]; bnxt_qplib_nq_stop_irq(nq, false); } @@ -362,7 +362,7 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) /* Vectors may change after restart, so update with new vectors * in device sctructure. */ - for (indx = 0; indx < rdev->num_msix; indx++) + for (indx = 0; indx < rdev->nqr->num_msix; indx++) rdev->en_dev->msix_entries[indx].vector = ent[indx].vector; rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, @@ -371,8 +371,8 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) ibdev_warn(&rdev->ibdev, "Failed to reinit CREQ\n"); return; } - for (indx = BNXT_RE_NQ_IDX ; indx < rdev->num_msix; indx++) { - nq = &rdev->nq[indx - 1]; + for (indx = BNXT_RE_NQ_IDX ; indx < rdev->nqr->num_msix; indx++) { + nq = &rdev->nqr->nq[indx - 1]; rc = bnxt_qplib_nq_start_irq(nq, indx - 1, msix_ent[indx].vector, false); if (rc) { @@ -1206,7 +1206,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) addrconf_addr_eui48((u8 *)&ibdev->node_guid, rdev->netdev->dev_addr); - ibdev->num_comp_vectors = rdev->num_msix - 1; + ibdev->num_comp_vectors = rdev->nqr->num_msix - 1; ibdev->dev.parent = &rdev->en_dev->pdev->dev; ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY; @@ -1551,8 +1551,8 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) { int i; - for (i = 1; i < rdev->num_msix; i++) - bnxt_qplib_disable_nq(&rdev->nq[i - 1]); + for (i = 1; i < rdev->nqr->num_msix; i++) + bnxt_qplib_disable_nq(&rdev->nqr->nq[i - 1]); if (rdev->qplib_res.rcfw) bnxt_qplib_cleanup_res(&rdev->qplib_res); @@ -1566,9 +1566,9 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) bnxt_qplib_init_res(&rdev->qplib_res); - for (i = 1; i < rdev->num_msix ; i++) { + for (i = 1; i < rdev->nqr->num_msix ; i++) { db_offt = rdev->en_dev->msix_entries[i].db_offset; - rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1], + rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nqr->nq[i - 1], i - 1, rdev->en_dev->msix_entries[i].vector, db_offt, &bnxt_re_cqn_handler, &bnxt_re_srqn_handler); @@ -1582,20 +1582,22 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) return 0; fail: for (i = num_vec_enabled; i >= 0; i--) - bnxt_qplib_disable_nq(&rdev->nq[i]); + bnxt_qplib_disable_nq(&rdev->nqr->nq[i]); return rc; } static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev) { + struct bnxt_qplib_nq *nq; u8 type; int i; - for (i = 0; i < rdev->num_msix - 1; i++) { + for (i = 0; i < rdev->nqr->num_msix - 1; i++) { type = bnxt_qplib_get_ring_type(rdev->chip_ctx); - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type); - bnxt_qplib_free_nq(&rdev->nq[i]); - rdev->nq[i].res = NULL; + nq = &rdev->nqr->nq[i]; + bnxt_re_net_ring_free(rdev, nq->ring_id, type); + bnxt_qplib_free_nq(nq); + nq->res = NULL; } } @@ -1637,12 +1639,12 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) if (rc) goto dealloc_res; - for (i = 0; i < rdev->num_msix - 1; i++) { + for (i = 0; i < rdev->nqr->num_msix - 1; i++) { struct bnxt_qplib_nq *nq; - nq = &rdev->nq[i]; + nq = &rdev->nqr->nq[i]; nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT; - rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, &rdev->nq[i]); + rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, nq); if (rc) { ibdev_err(&rdev->ibdev, "Alloc Failed NQ%d rc:%#x", i, rc); @@ -1650,7 +1652,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) } type = bnxt_qplib_get_ring_type(rdev->chip_ctx); rattr.dma_arr = nq->hwq.pbl[PBL_LVL_0].pg_map_arr; - rattr.pages = nq->hwq.pbl[rdev->nq[i].hwq.level].pg_count; + rattr.pages = nq->hwq.pbl[rdev->nqr->nq[i].hwq.level].pg_count; rattr.type = type; rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; rattr.depth = BNXT_QPLIB_NQE_MAX_CNT - 1; @@ -1660,7 +1662,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) ibdev_err(&rdev->ibdev, "Failed to allocate NQ fw id with rc = 0x%x", rc); - bnxt_qplib_free_nq(&rdev->nq[i]); + bnxt_qplib_free_nq(nq); goto free_nq; } num_vec_created++; @@ -1669,8 +1671,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) free_nq: for (i = num_vec_created - 1; i >= 0; i--) { type = bnxt_qplib_get_ring_type(rdev->chip_ctx); - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type); - bnxt_qplib_free_nq(&rdev->nq[i]); + bnxt_re_net_ring_free(rdev, rdev->nqr->nq[i].ring_id, type); + bnxt_qplib_free_nq(&rdev->nqr->nq[i]); } bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &rdev->dpi_privileged); @@ -1865,6 +1867,21 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) return rc; } +static int bnxt_re_alloc_nqr_mem(struct bnxt_re_dev *rdev) +{ + rdev->nqr = kzalloc(sizeof(*rdev->nqr), GFP_KERNEL); + if (!rdev->nqr) + return -ENOMEM; + + return 0; +} + +static void bnxt_re_free_nqr_mem(struct bnxt_re_dev *rdev) +{ + kfree(rdev->nqr); + rdev->nqr = NULL; +} + static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) { u8 type; @@ -1894,11 +1911,12 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) bnxt_qplib_free_rcfw_channel(&rdev->rcfw); } - rdev->num_msix = 0; + rdev->nqr->num_msix = 0; if (rdev->pacing.dbr_pacing) bnxt_re_deinitialize_dbr_pacing(rdev); + bnxt_re_free_nqr_mem(rdev); bnxt_re_destroy_chip_ctx(rdev); if (op_type == BNXT_RE_COMPLETE_REMOVE) { if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) @@ -1946,7 +1964,6 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) } ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", rdev->en_dev->ulp_tbl->msix_requested); - rdev->num_msix = rdev->en_dev->ulp_tbl->msix_requested; rc = bnxt_re_setup_chip_ctx(rdev); if (rc) { @@ -1956,6 +1973,15 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) return -EINVAL; } + rc = bnxt_re_alloc_nqr_mem(rdev); + if (rc) { + bnxt_re_destroy_chip_ctx(rdev); + bnxt_unregister_dev(rdev->en_dev); + clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return rc; + } + rdev->nqr->num_msix = rdev->en_dev->ulp_tbl->msix_requested; + /* Check whether VF or PF */ bnxt_re_get_sriov_func_type(rdev); From cb97b377a13589b4880eeb1524dbc47087c3244b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 14 Nov 2024 01:49:07 -0800 Subject: [PATCH 43/51] RDMA/bnxt_re: Refurbish CQ to NQ hash calculation There are few use cases where CQ create and destroy is seen before re-creating the CQ, this kind of use case is disturbing the RR distribution and all the active CQ getting mapped to only 2 NQ alternatively. Fixing the CQ to NQ hash calculation by implementing a quick load sorting mechanism under a mutex. Using this, if the CQ was allocated and destroyed before using it, the nq selecting algorithm still obtains the least loaded CQ. Thus balancing the load on NQs. Signed-off-by: Selvin Xavier Signed-off-by: Kalesh AP Link: https://patch.msgid.link/1731577748-1804-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 ++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 37 +++++++++++++++++------- drivers/infiniband/hw/bnxt_re/main.c | 2 ++ drivers/infiniband/hw/bnxt_re/qplib_fp.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_fp.h | 1 + 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index d1c839efab05..5f64fc496812 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -159,6 +159,8 @@ struct bnxt_re_pacing { struct bnxt_re_nq_record { struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; int num_msix; + /* serialize NQ access */ + struct mutex load_lock; }; #define MAX_CQ_HASH_BITS (16) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index a9c32c031705..f6e9eefc95d6 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3029,6 +3029,28 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr, return rc; } +static struct bnxt_qplib_nq *bnxt_re_get_nq(struct bnxt_re_dev *rdev) +{ + int min, indx; + + mutex_lock(&rdev->nqr->load_lock); + for (indx = 0, min = 0; indx < (rdev->nqr->num_msix - 1); indx++) { + if (rdev->nqr->nq[min].load > rdev->nqr->nq[indx].load) + min = indx; + } + rdev->nqr->nq[min].load++; + mutex_unlock(&rdev->nqr->load_lock); + + return &rdev->nqr->nq[min]; +} + +static void bnxt_re_put_nq(struct bnxt_re_dev *rdev, struct bnxt_qplib_nq *nq) +{ + mutex_lock(&rdev->nqr->load_lock); + nq->load--; + mutex_unlock(&rdev->nqr->load_lock); +} + /* Completion Queues */ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { @@ -3047,6 +3069,8 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) hash_del(&cq->hash_entry); } bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); + + bnxt_re_put_nq(rdev, nq); ib_umem_release(cq->umem); atomic_dec(&rdev->stats.res.cq_count); @@ -3065,8 +3089,6 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; struct bnxt_qplib_chip_ctx *cctx; - struct bnxt_qplib_nq *nq = NULL; - unsigned int nq_alloc_cnt; int cqe = attr->cqe; int rc, entries; u32 active_cqs; @@ -3117,16 +3139,10 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->qplib_cq.dpi = &rdev->dpi_privileged; } - /* - * Allocating the NQ in a round robin fashion. nq_alloc_cnt is a - * used for getting the NQ index. - */ - nq_alloc_cnt = atomic_inc_return(&rdev->nq_alloc_cnt); - nq = &rdev->nqr->nq[nq_alloc_cnt % (rdev->nqr->num_msix - 1)]; cq->qplib_cq.max_wqe = entries; - cq->qplib_cq.cnq_hw_ring_id = nq->ring_id; - cq->qplib_cq.nq = nq; cq->qplib_cq.coalescing = &rdev->cq_coalescing; + cq->qplib_cq.nq = bnxt_re_get_nq(rdev); + cq->qplib_cq.cnq_hw_ring_id = cq->qplib_cq.nq->ring_id; rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq); if (rc) { @@ -3136,7 +3152,6 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->ib_cq.cqe = entries; cq->cq_period = cq->qplib_cq.period; - nq->budget++; active_cqs = atomic_inc_return(&rdev->stats.res.cq_count); if (active_cqs > rdev->stats.res.cq_watermark) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 9669defcebf6..fcaf2b3cc01d 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1566,6 +1566,8 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) bnxt_qplib_init_res(&rdev->qplib_res); + mutex_init(&rdev->nqr->load_lock); + for (i = 1; i < rdev->nqr->num_msix ; i++) { db_offt = rdev->en_dev->msix_entries[i].db_offset; rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nqr->nq[i - 1], diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index e2eea714e977..e56f42fddefe 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -551,6 +551,7 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, nq->pdev = pdev; nq->cqn_handler = cqn_handler; nq->srqn_handler = srqn_handler; + nq->load = 0; /* Have a task to schedule CQ notifiers in post send case */ nq->cqn_wq = create_singlethread_workqueue("bnxt_qplib_nq"); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index b5a905819ecc..8ff56d7320ca 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -519,6 +519,7 @@ struct bnxt_qplib_nq { struct tasklet_struct nq_tasklet; bool requested; int budget; + u32 load; cqn_handler_t cqn_handler; srqn_handler_t srqn_handler; From 31bad59805c388f92f3a13174a149c2228301c15 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 14 Nov 2024 01:49:08 -0800 Subject: [PATCH 44/51] RDMA/bnxt_re: Cache MSIx info to a local structure L2 driver allocates the vectors for RoCE and pass it through the en_dev structure to RoCE. During probe, cache the MSIx related info to a local structure. Signed-off-by: Selvin Xavier Signed-off-by: Kalesh AP Link: https://patch.msgid.link/1731577748-1804-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 1 + drivers/infiniband/hw/bnxt_re/main.c | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 5f64fc496812..2975b11b79bf 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -157,6 +157,7 @@ struct bnxt_re_pacing { #define BNXT_RE_MIN_MSIX 2 #define BNXT_RE_MAX_MSIX BNXT_MAX_ROCE_MSIX struct bnxt_re_nq_record { + struct bnxt_msix_entry msix_entries[BNXT_RE_MAX_MSIX]; struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; int num_msix; /* serialize NQ access */ diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index fcaf2b3cc01d..533b9f110d24 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -347,7 +347,7 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) return; rdev = en_info->rdev; - msix_ent = rdev->en_dev->msix_entries; + msix_ent = rdev->nqr->msix_entries; rcfw = &rdev->rcfw; if (!ent) { /* Not setting the f/w timeout bit in rcfw. @@ -363,7 +363,7 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) * in device sctructure. */ for (indx = 0; indx < rdev->nqr->num_msix; indx++) - rdev->en_dev->msix_entries[indx].vector = ent[indx].vector; + rdev->nqr->msix_entries[indx].vector = ent[indx].vector; rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, false); @@ -1569,9 +1569,9 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) mutex_init(&rdev->nqr->load_lock); for (i = 1; i < rdev->nqr->num_msix ; i++) { - db_offt = rdev->en_dev->msix_entries[i].db_offset; + db_offt = rdev->nqr->msix_entries[i].db_offset; rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nqr->nq[i - 1], - i - 1, rdev->en_dev->msix_entries[i].vector, + i - 1, rdev->nqr->msix_entries[i].vector, db_offt, &bnxt_re_cqn_handler, &bnxt_re_srqn_handler); if (rc) { @@ -1658,7 +1658,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) rattr.type = type; rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; rattr.depth = BNXT_QPLIB_NQE_MAX_CNT - 1; - rattr.lrid = rdev->en_dev->msix_entries[i + 1].ring_idx; + rattr.lrid = rdev->nqr->msix_entries[i + 1].ring_idx; rc = bnxt_re_net_ring_alloc(rdev, &rattr, &nq->ring_id); if (rc) { ibdev_err(&rdev->ibdev, @@ -1983,6 +1983,8 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) return rc; } rdev->nqr->num_msix = rdev->en_dev->ulp_tbl->msix_requested; + memcpy(rdev->nqr->msix_entries, rdev->en_dev->msix_entries, + sizeof(struct bnxt_msix_entry) * rdev->nqr->num_msix); /* Check whether VF or PF */ bnxt_re_get_sriov_func_type(rdev); @@ -2008,14 +2010,14 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) rattr.type = type; rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; rattr.depth = BNXT_QPLIB_CREQE_MAX_CNT - 1; - rattr.lrid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx; + rattr.lrid = rdev->nqr->msix_entries[BNXT_RE_AEQ_IDX].ring_idx; rc = bnxt_re_net_ring_alloc(rdev, &rattr, &creq->ring_id); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc); goto free_rcfw; } - db_offt = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].db_offset; - vid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].vector; + db_offt = rdev->nqr->msix_entries[BNXT_RE_AEQ_IDX].db_offset; + vid = rdev->nqr->msix_entries[BNXT_RE_AEQ_IDX].vector; rc = bnxt_qplib_enable_rcfw_channel(&rdev->rcfw, vid, db_offt, &bnxt_re_aeq_handler); From ede132a5cf559f3ab35a4c28bac4f4a6c20334d8 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 13 Nov 2024 13:23:19 +0200 Subject: [PATCH 45/51] RDMA/mlx5: Move events notifier registration to be after device registration Move pkey change work initialization and cleanup from device resources stage to notifier stage, since this is the stage which handles this work events. Fix a race between the device deregistration and pkey change work by moving MLX5_IB_STAGE_DEVICE_NOTIFIER to be after MLX5_IB_STAGE_IB_REG in order to ensure that the notifier is deregistered before the device during cleanup. Which ensures there are no works that are being executed after the device has already unregistered which can cause the panic below. BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 630071 Comm: kworker/1:2 Kdump: loaded Tainted: G W OE --------- --- 5.14.0-162.6.1.el9_1.x86_64 #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 02/27/2023 Workqueue: events pkey_change_handler [mlx5_ib] RIP: 0010:setup_qp+0x38/0x1f0 [mlx5_ib] Code: ee 41 54 45 31 e4 55 89 f5 53 48 89 fb 48 83 ec 20 8b 77 08 65 48 8b 04 25 28 00 00 00 48 89 44 24 18 48 8b 07 48 8d 4c 24 16 <4c> 8b 38 49 8b 87 80 0b 00 00 4c 89 ff 48 8b 80 08 05 00 00 8b 40 RSP: 0018:ffffbcc54068be20 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff954054494128 RCX: ffffbcc54068be36 RDX: ffff954004934000 RSI: 0000000000000001 RDI: ffff954054494128 RBP: 0000000000000023 R08: ffff954001be2c20 R09: 0000000000000001 R10: ffff954001be2c20 R11: ffff9540260133c0 R12: 0000000000000000 R13: 0000000000000023 R14: 0000000000000000 R15: ffff9540ffcb0905 FS: 0000000000000000(0000) GS:ffff9540ffc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000010625c001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mlx5_ib_gsi_pkey_change+0x20/0x40 [mlx5_ib] process_one_work+0x1e8/0x3c0 worker_thread+0x50/0x3b0 ? rescuer_thread+0x380/0x380 kthread+0x149/0x170 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x22/0x30 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_fwctl(OE) fwctl(OE) ib_uverbs(OE) mlx5_core(OE) mlxdevm(OE) ib_core(OE) mlx_compat(OE) psample mlxfw(OE) tls knem(OE) netconsole nfsv3 nfs_acl nfs lockd grace fscache netfs qrtr rfkill sunrpc intel_rapl_msr intel_rapl_common rapl hv_balloon hv_utils i2c_piix4 pcspkr joydev fuse ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ata_generic pci_hyperv pci_hyperv_intf hyperv_drm drm_shmem_helper drm_kms_helper hv_storvsc syscopyarea hv_netvsc sysfillrect sysimgblt hid_hyperv fb_sys_fops scsi_transport_fc hyperv_keyboard drm ata_piix crct10dif_pclmul crc32_pclmul crc32c_intel libata ghash_clmulni_intel hv_vmbus serio_raw [last unloaded: ib_core] CR2: 0000000000000000 ---[ end trace f6f8be4eae12f7bc ]--- Fixes: 7722f47e71e5 ("IB/mlx5: Create GSI transmission QPs when P_Key table is changed") Signed-off-by: Patrisious Haddad Reviewed-by: Michael Guralnik Link: https://patch.msgid.link/d271ceeff0c08431b3cbbbb3e2d416f09b6d1621.1731496944.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 40 +++++++++++++--------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 65da5df05d02..bc7930d0c564 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3005,7 +3005,6 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - int port; int ret; if (!MLX5_CAP_GEN(dev->mdev, xrc)) @@ -3021,10 +3020,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) return ret; } - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - INIT_WORK(&devr->ports[port].pkey_change_work, - pkey_change_handler); - mutex_init(&devr->cq_lock); mutex_init(&devr->srq_lock); @@ -3034,16 +3029,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - int port; - - /* - * Make sure no change P_Key work items are still executing. - * - * At this stage, the mlx5_ib_event should be unregistered - * and it ensures that no new works are added. - */ - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - cancel_work_sync(&devr->ports[port].pkey_change_work); /* After s0/s1 init, they are not unset during the device lifetime. */ if (devr->s1) { @@ -4480,6 +4465,13 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + dev->mdev_events.notifier_call = mlx5_ib_event; mlx5_notifier_register(dev->mdev, &dev->mdev_events); @@ -4490,8 +4482,14 @@ static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + mlx5r_macsec_event_unregister(dev); mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); } void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, @@ -4581,9 +4579,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_ODP, mlx5_ib_odp_init_one, mlx5_ib_odp_cleanup_one), @@ -4608,6 +4603,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), @@ -4644,9 +4642,6 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, mlx5_ib_counters_init, mlx5_ib_counters_cleanup), @@ -4668,6 +4663,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index ed4eaaa7ac71..a01b592aa716 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -973,7 +973,6 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_QP, MLX5_IB_STAGE_SRQ, MLX5_IB_STAGE_DEVICE_RESOURCES, - MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_ODP, MLX5_IB_STAGE_COUNTERS, MLX5_IB_STAGE_CONG_DEBUGFS, @@ -982,6 +981,7 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_PRE_IB_REG_UMR, MLX5_IB_STAGE_WHITELIST_UID, MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_POST_IB_REG_UMR, MLX5_IB_STAGE_DELAY_DROP, MLX5_IB_STAGE_RESTRACK, From 0492458750c9fbd69cfc7baddd3ddcac77f2a0c8 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 13 Nov 2024 13:12:54 +0200 Subject: [PATCH 46/51] IB/cm: Explicitly mark if a response MAD is a retransmission In several situations the CM may send a reply to a received MAD without the reply being directly linked with a cm_id. For example, it may send a REJ in response to a REQ which does not match a listener. Or, it may send a DREP in response to a DREQ if the cm_id has already been destroyed. This can happen if the original DREP was lost and the DREQ was retried. When such a response MAD completes, it updates a counter tracking how many MADs were retried. However, not all response MADs issued directly by the CM may be retries. The REJ mentioned in the example above is such a case. To distinguish between responses which were retries versus those that are not, the send_handler performs the following check: is a retry if the response is not associated with a cm_id and the response is not a REJ message. Replace this indirect method of checking if a response is a retry with an explicit check. Note that these retries are generated directly by the CM, rather than retried by the MAD layer. This change will be needed by later changes which would otherwise break the indirect check. Signed-off-by: Sean Hefty Signed-off-by: Or Har-Toov Signed-off-by: Vlad Dumitrescu Link: https://patch.msgid.link/1ee6e2a68f8de1992b9da23aa1d7e3f9f25e0036.1731495873.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cm.c | 51 ++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 07fb8d3c037f..99246e49dd3a 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -35,6 +35,8 @@ MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); #define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */ +#define CM_DIRECT_RETRY_CTX ((void *) 1UL) + static const char * const ibcm_rej_reason_strs[] = { [IB_CM_REJ_NO_QP] = "no QP", [IB_CM_REJ_NO_EEC] = "no EEC", @@ -358,13 +360,20 @@ static void cm_free_priv_msg(struct ib_mad_send_buf *msg) ib_free_send_mad(msg); } -static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, - struct ib_mad_recv_wc *mad_recv_wc) +static struct ib_mad_send_buf * +cm_alloc_response_msg_no_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + bool direct_retry) { - return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, - 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC, - IB_MGMT_BASE_VERSION); + struct ib_mad_send_buf *m; + + m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, IB_MGMT_BASE_VERSION); + if (!IS_ERR(m)) + m->context[0] = direct_retry ? CM_DIRECT_RETRY_CTX : NULL; + + return m; } static int cm_create_response_msg_ah(struct cm_port *port, @@ -384,12 +393,13 @@ static int cm_create_response_msg_ah(struct cm_port *port, static int cm_alloc_response_msg(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, + bool direct_retry, struct ib_mad_send_buf **msg) { struct ib_mad_send_buf *m; int ret; - m = cm_alloc_response_msg_no_ah(port, mad_recv_wc); + m = cm_alloc_response_msg_no_ah(port, mad_recv_wc, direct_retry); if (IS_ERR(m)) return PTR_ERR(m); @@ -1598,7 +1608,7 @@ static int cm_issue_rej(struct cm_port *port, struct cm_rej_msg *rej_msg, *rcv_msg; int ret; - ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + ret = cm_alloc_response_msg(port, mad_recv_wc, false, &msg); if (ret) return ret; @@ -1951,7 +1961,7 @@ static void cm_dup_req_handler(struct cm_work *work, } spin_unlock_irq(&cm_id_priv->lock); - ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg); if (ret) return; @@ -2444,7 +2454,7 @@ static void cm_dup_rep_handler(struct cm_work *work) atomic_long_inc( &work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]); - ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg); if (ret) goto deref; @@ -2791,7 +2801,7 @@ static int cm_issue_drep(struct cm_port *port, struct cm_drep_msg *drep_msg; int ret; - ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + ret = cm_alloc_response_msg(port, mad_recv_wc, true, &msg); if (ret) return ret; @@ -2856,7 +2866,8 @@ static int cm_dreq_handler(struct cm_work *work) case IB_CM_TIMEWAIT: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_DREQ_COUNTER]); - msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc, + true); if (IS_ERR(msg)) goto unlock; @@ -3361,7 +3372,8 @@ static int cm_lap_handler(struct cm_work *work) case IB_CM_MRA_LAP_SENT: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] [CM_LAP_COUNTER]); - msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc, + true); if (IS_ERR(msg)) goto unlock; @@ -3826,7 +3838,7 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_send_buf *msg = mad_send_wc->send_buf; - struct cm_id_private *cm_id_priv = msg->context[0]; + struct cm_id_private *cm_id_priv; enum ib_cm_state state = (enum ib_cm_state)(unsigned long)msg->context[1]; struct cm_port *port; @@ -3836,13 +3848,12 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, attr_index = be16_to_cpu(((struct ib_mad_hdr *) msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; - /* - * If the send was in response to a received message (context[0] is not - * set to a cm_id), and is not a REJ, then it is a send that was - * manually retried. - */ - if (!cm_id_priv && (attr_index != CM_REJ_COUNTER)) + if (msg->context[0] == CM_DIRECT_RETRY_CTX) { msg->retries = 1; + cm_id_priv = NULL; + } else { + cm_id_priv = msg->context[0]; + } atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]); if (msg->retries) From 1e5159219076ddb2e44338c667c83fd1bd43dfef Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 13 Nov 2024 13:12:55 +0200 Subject: [PATCH 47/51] IB/cm: Do not hold reference on cm_id unless needed Typically, when the CM sends a MAD it bumps a reference count on the associated cm_id. There are some exceptions, such as when the MAD is a direct response to a receive MAD. For example, the CM may generate an MRA in response to a duplicate REQ. But, in general, if a MAD may be sent as a result of the user invoking an API call (e.g. ib_send_cm_rep(), ib_send_cm_rtu(), etc.), a reference is taken on the cm_id. This reference is necessary if the MAD requires a response. The reference allows routing a response MAD back to the cm_id, or, if no response is received, allows updating the cm_id state to reflect the failure. For MADs which do not generate a response from the target, however, there's no need to hold a reference on the cm_id. Such MADs will not be retried by the MAD layer and their completions do not change the state of the cm_id. There are 2 internal calls used to allocate MADs which take a reference on the cm_id: cm_alloc_msg() and cm_alloc_priv_msg(). The latter calls the former. It turns out that all other places where cm_alloc_msg() is called are for MADs that do not generate a response from the target: sending an RTU, DREP, REJ, MRA, or SIDR REP. In all of these cases, there's no need to hold a reference on the cm_id. The benefit of dropping unneeded references is that it allows destruction of the cm_id to proceed immediately. Currently, the cm_destroy_id() call blocks as long as there's a reference held on the cm_id. Worse, is that cm_destroy_id() will send MADs, which it then needs to complete. Sending the MADs is beneficial, as they notify the peer that a connection is being destroyed. However, since the MADs hold a reference on the cm_id, they block destruction and cannot be retried. Move cm_id referencing from cm_alloc_msg() to cm_alloc_priv_msg(). The latter should hold a reference on the cm_id in all cases but one, which will be handled in a separate patch. cm_alloc_priv_msg() is used when sending a REQ, REP, DREQ, and SIDR REQ, all of which require a response. Also, merge common code into cm_alloc_priv_msg() and combine the freeing of all messages which do not need a response. Signed-off-by: Sean Hefty Signed-off-by: Or Har-Toov Signed-off-by: Vlad Dumitrescu Link: https://patch.msgid.link/1f0f96acace72790ecf89087fc765dead960189e.1731495873.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cm.c | 66 +++++++++++++----------------------- 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 99246e49dd3a..2517bfebcfd5 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -309,12 +309,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) goto out; } - /* Timeout set by caller if response is expected. */ m->ah = ah; - m->retries = cm_id_priv->max_cm_retries; - - refcount_inc(&cm_id_priv->refcount); - m->context[0] = cm_id_priv; out: spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); @@ -323,16 +318,13 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) static void cm_free_msg(struct ib_mad_send_buf *msg) { - struct cm_id_private *cm_id_priv = msg->context[0]; - if (msg->ah) rdma_destroy_ah(msg->ah, 0); - cm_deref_id(cm_id_priv); ib_free_send_mad(msg); } static struct ib_mad_send_buf * -cm_alloc_priv_msg(struct cm_id_private *cm_id_priv) +cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) { struct ib_mad_send_buf *msg; @@ -341,7 +333,15 @@ cm_alloc_priv_msg(struct cm_id_private *cm_id_priv) msg = cm_alloc_msg(cm_id_priv); if (IS_ERR(msg)) return msg; + cm_id_priv->msg = msg; + refcount_inc(&cm_id_priv->refcount); + msg->context[0] = cm_id_priv; + msg->context[1] = (void *) (unsigned long) state; + + msg->retries = cm_id_priv->max_cm_retries; + msg->timeout_ms = cm_id_priv->timeout_ms; + return msg; } @@ -413,13 +413,6 @@ static int cm_alloc_response_msg(struct cm_port *port, return 0; } -static void cm_free_response_msg(struct ib_mad_send_buf *msg) -{ - if (msg->ah) - rdma_destroy_ah(msg->ah, 0); - ib_free_send_mad(msg); -} - static void *cm_copy_private_data(const void *private_data, u8 private_data_len) { void *data; @@ -1567,7 +1560,7 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, if (param->alternate_path) cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); - msg = cm_alloc_priv_msg(cm_id_priv); + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REQ_SENT); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out_unlock; @@ -1576,8 +1569,6 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, req_msg = (struct cm_req_msg *)msg->mad; cm_format_req(req_msg, cm_id_priv, param); cm_id_priv->tid = req_msg->hdr.tid; - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *)(unsigned long)IB_CM_REQ_SENT; cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); @@ -1634,7 +1625,7 @@ static int cm_issue_rej(struct cm_port *port, IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) - cm_free_response_msg(msg); + cm_free_msg(msg); return ret; } @@ -1990,7 +1981,7 @@ static void cm_dup_req_handler(struct cm_work *work, return; unlock: spin_unlock_irq(&cm_id_priv->lock); -free: cm_free_response_msg(msg); +free: cm_free_msg(msg); } static struct cm_id_private *cm_match_req(struct cm_work *work, @@ -2304,7 +2295,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, goto out; } - msg = cm_alloc_priv_msg(cm_id_priv); + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REP_SENT); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out; @@ -2312,8 +2303,6 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, rep_msg = (struct cm_rep_msg *) msg->mad; cm_format_rep(rep_msg, cm_id_priv, param); - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; trace_icm_send_rep(cm_id); ret = ib_post_send_mad(msg, NULL); @@ -2479,7 +2468,7 @@ static void cm_dup_rep_handler(struct cm_work *work) goto deref; unlock: spin_unlock_irq(&cm_id_priv->lock); -free: cm_free_response_msg(msg); +free: cm_free_msg(msg); deref: cm_deref_id(cm_id_priv); } @@ -2683,7 +2672,7 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) ib_cancel_mad(cm_id_priv->msg); - msg = cm_alloc_priv_msg(cm_id_priv); + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_DREQ_SENT); if (IS_ERR(msg)) { cm_enter_timewait(cm_id_priv); return PTR_ERR(msg); @@ -2691,8 +2680,6 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, private_data, private_data_len); - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; trace_icm_send_dreq(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); @@ -2819,7 +2806,7 @@ static int cm_issue_drep(struct cm_port *port, IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) - cm_free_response_msg(msg); + cm_free_msg(msg); return ret; } @@ -2878,7 +2865,7 @@ static int cm_dreq_handler(struct cm_work *work) if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) - cm_free_response_msg(msg); + cm_free_msg(msg); goto deref; case IB_CM_DREQ_RCVD: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] @@ -3386,7 +3373,7 @@ static int cm_lap_handler(struct cm_work *work) if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || ib_post_send_mad(msg, NULL)) - cm_free_response_msg(msg); + cm_free_msg(msg); goto deref; case IB_CM_LAP_RCVD: atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] @@ -3525,7 +3512,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, goto out_unlock; } - msg = cm_alloc_priv_msg(cm_id_priv); + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_SIDR_REQ_SENT); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out_unlock; @@ -3533,8 +3520,6 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv, param); - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *)(unsigned long)IB_CM_SIDR_REQ_SENT; trace_icm_send_sidr_req(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); @@ -3780,17 +3765,17 @@ static int cm_sidr_rep_handler(struct cm_work *work) static void cm_process_send_error(struct cm_id_private *cm_id_priv, struct ib_mad_send_buf *msg, - enum ib_cm_state state, enum ib_wc_status wc_status) { + enum ib_cm_state state = (unsigned long) msg->context[1]; struct ib_cm_event cm_event = {}; int ret; - /* Discard old sends or ones without a response. */ + /* Discard old sends. */ spin_lock_irq(&cm_id_priv->lock); if (msg != cm_id_priv->msg) { spin_unlock_irq(&cm_id_priv->lock); - cm_free_msg(msg); + cm_free_priv_msg(msg); return; } cm_free_priv_msg(msg); @@ -3839,8 +3824,6 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, { struct ib_mad_send_buf *msg = mad_send_wc->send_buf; struct cm_id_private *cm_id_priv; - enum ib_cm_state state = - (enum ib_cm_state)(unsigned long)msg->context[1]; struct cm_port *port; u16 attr_index; @@ -3861,10 +3844,9 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, &port->counters[CM_XMIT_RETRIES][attr_index]); if (cm_id_priv) - cm_process_send_error(cm_id_priv, msg, state, - mad_send_wc->status); + cm_process_send_error(cm_id_priv, msg, mad_send_wc->status); else - cm_free_response_msg(msg); + cm_free_msg(msg); } static void cm_work_handler(struct work_struct *_work) From fc0856c3a32576fb21c494f38b9c6c8dc3bf58ab Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 13 Nov 2024 13:12:56 +0200 Subject: [PATCH 48/51] IB/cm: Rework sending DREQ when destroying a cm_id A DREQ is sent in 2 situations: 1. When requested by the user. This DREQ has to wait for a DREP, which will be routed to the user. 2. When the cm_id is destroyed. This DREQ is generated by the CM to notify the peer that the connection has been destroyed. In the latter case, any DREP that is received will be discarded. There's no need to hold a reference on the cm_id. Today, both situations are covered by the same function: cm_send_dreq_locked(). When invoked in the cm_id destroy path, the cm_id reference would be held until the DREQ completes, blocking the destruction. Because it could take several seconds to minutes before the DREQ receives a DREP, the destroy call posts a send for the DREQ then immediately cancels the MAD. However, cancellation is not immediate in the MAD layer. There could still be a delay before the MAD layer returns the DREQ to the CM. Moreover, the only guarantee is that the DREQ will be sent at most once. Introduce a separate flow for sending a DREQ when destroying the cm_id. The new flow will not hold a reference on the cm_id, allowing it to be cleaned up immediately. The cancellation trick is no longer needed. The MAD layer will send the DREQ exactly once. Signed-off-by: Sean Hefty Signed-off-by: Or Har-Toov Signed-off-by: Vlad Dumitrescu Link: https://patch.msgid.link/a288a098b8e0550305755fd4a7937431699317f4.1731495873.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cm.c | 53 ++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 2517bfebcfd5..142170473e75 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -95,8 +95,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv, struct cm_work *work); static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param); -static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, - const void *private_data, u8 private_data_len); +static void cm_issue_dreq(struct cm_id_private *cm_id_priv); static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, void *private_data, u8 private_data_len); static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, @@ -1112,7 +1111,8 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) cm_id->state = IB_CM_IDLE; break; } - cm_send_dreq_locked(cm_id_priv, NULL, 0); + cm_issue_dreq(cm_id_priv); + cm_enter_timewait(cm_id_priv); goto retest; case IB_CM_DREQ_SENT: ib_cancel_mad(cm_id_priv->msg); @@ -2652,20 +2652,42 @@ static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, private_data_len); } -static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, - const void *private_data, u8 private_data_len) +static void cm_issue_dreq(struct cm_id_private *cm_id_priv) { struct ib_mad_send_buf *msg; int ret; lockdep_assert_held(&cm_id_priv->lock); + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return; + + cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, NULL, 0); + + trace_icm_send_dreq(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_msg(msg); +} + +int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) return -EINVAL; + spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { trace_icm_dreq_skipped(&cm_id_priv->id); - return -EINVAL; + ret = -EINVAL; + goto unlock; } if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || @@ -2675,7 +2697,8 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_DREQ_SENT); if (IS_ERR(msg)) { cm_enter_timewait(cm_id_priv); - return PTR_ERR(msg); + ret = PTR_ERR(msg); + goto unlock; } cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, @@ -2686,23 +2709,11 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, if (ret) { cm_enter_timewait(cm_id_priv); cm_free_priv_msg(msg); - return ret; + goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_SENT; - return 0; -} - -int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, - u8 private_data_len) -{ - struct cm_id_private *cm_id_priv = - container_of(cm_id, struct cm_id_private, id); - unsigned long flags; - int ret; - - spin_lock_irqsave(&cm_id_priv->lock, flags); - ret = cm_send_dreq_locked(cm_id_priv, private_data, private_data_len); +unlock: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } From c64b16a37b6d240f6baefd7061970a01c23da61d Mon Sep 17 00:00:00 2001 From: Chandramohan Akula Date: Fri, 15 Nov 2024 00:47:42 -0800 Subject: [PATCH 49/51] RDMA/bnxt_re: Support different traffic class Adding support for different traffic class passed to driver. Fix the traffic class setting in modify_qp by skipping the ECN bits. Pass the service level received from applications to the firmware. Signed-off-by: Chandramohan Akula Reviewed-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1731660464-27838-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 +- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index f6e9eefc95d6..481261f8d7ed 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2136,7 +2136,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp->qplib_qp.ah.sgid_index = ctx->idx; qp->qplib_qp.ah.host_sgid_index = grh->sgid_index; qp->qplib_qp.ah.hop_limit = grh->hop_limit; - qp->qplib_qp.ah.traffic_class = grh->traffic_class; + qp->qplib_qp.ah.traffic_class = grh->traffic_class >> 2; qp->qplib_qp.ah.sl = rdma_ah_get_sl(&qp_attr->ah_attr); ether_addr_copy(qp->qplib_qp.ah.dmac, qp_attr->ah_attr.roce.dmac); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index e56f42fddefe..256c4379ab7f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1318,6 +1318,7 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) struct creq_modify_qp_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_modify_qp req = {}; + u16 vlan_pcp_vlan_dei_vlan_id; u32 temp32[4]; u32 bmask; int rc; @@ -1414,7 +1415,16 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID) req.dest_qp_id = cpu_to_le32(qp->dest_qpn); - req.vlan_pcp_vlan_dei_vlan_id = cpu_to_le16(qp->vlan_id); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID) { + vlan_pcp_vlan_dei_vlan_id = + ((res->sgid_tbl.tbl[qp->ah.sgid_index].vlan_id << + CMDQ_MODIFY_QP_VLAN_ID_SFT) & + CMDQ_MODIFY_QP_VLAN_ID_MASK); + vlan_pcp_vlan_dei_vlan_id |= + ((qp->ah.sl << CMDQ_MODIFY_QP_VLAN_PCP_SFT) & + CMDQ_MODIFY_QP_VLAN_PCP_MASK); + req.vlan_pcp_vlan_dei_vlan_id = cpu_to_le16(vlan_pcp_vlan_dei_vlan_id); + } bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); From bfb27ae6d0f9a1229ab15c2f13616e96b4a4419e Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Fri, 15 Nov 2024 00:47:43 -0800 Subject: [PATCH 50/51] RDMA/bnxt_re: Use the default mode of congestion control Instead of driver setting the congestion mode, use the default values setup by Firmware. Enable the tos_ecn field in FW. Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1731660464-27838-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 533b9f110d24..ac475a596f03 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -2204,11 +2204,10 @@ static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) if (enable) { cc_param.enable = 1; - cc_param.cc_mode = CMDQ_MODIFY_ROCE_CC_CC_MODE_PROBABILISTIC_CC_MODE; + cc_param.tos_ecn = 1; } - cc_param.mask = (CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_CC_MODE | - CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC | + cc_param.mask = (CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC | CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN); if (bnxt_qplib_modify_cc(&rdev->qplib_res, &cc_param)) From 68b3bca2df00f0a63f0aa2db2b2adc795665229e Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Fri, 15 Nov 2024 00:47:44 -0800 Subject: [PATCH 51/51] RDMA/bnxt_re: Correct the sequence of device suspend When in fatal error condition, mark device as detached first and then complete all pending HWRM commands as firmware is not going to process them and eventually time out. Move the device to error only if suspend is called when device is in Fatal state. Also, remove some outdated comments. Remove the stop_irq call which is no longer required. Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error is detected") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1731660464-27838-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index ac475a596f03..298c848f3a4d 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -2347,12 +2347,6 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) rdev = en_info->rdev; en_dev = en_info->en_dev; mutex_lock(&bnxt_re_mutex); - /* L2 driver may invoke this callback during device error/crash or device - * reset. Current RoCE driver doesn't recover the device in case of - * error. Handle the error by dispatching fatal events to all qps - * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as - * L2 driver want to modify the MSIx table. - */ ibdev_info(&rdev->ibdev, "Handle device suspend call"); /* Check the current device state from bnxt_en_dev and move the @@ -2360,17 +2354,12 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) * This prevents more commands to HW during clean-up, * in case the device is already in error. */ - if (test_bit(BNXT_STATE_FW_FATAL_COND, &rdev->en_dev->en_state)) + if (test_bit(BNXT_STATE_FW_FATAL_COND, &rdev->en_dev->en_state)) { set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); - - bnxt_re_dev_stop(rdev); - bnxt_re_stop_irq(adev); - /* Move the device states to detached and avoid sending any more - * commands to HW - */ - set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); - set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); - wake_up_all(&rdev->rcfw.cmdq.waitq); + set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); + wake_up_all(&rdev->rcfw.cmdq.waitq); + bnxt_re_dev_stop(rdev); + } if (rdev->pacing.dbr_pacing) bnxt_re_set_pacing_dev_state(rdev); @@ -2392,13 +2381,6 @@ static int bnxt_re_resume(struct auxiliary_device *adev) return 0; mutex_lock(&bnxt_re_mutex); - /* L2 driver may invoke this callback during device recovery, resume. - * reset. Current RoCE driver doesn't recover the device in case of - * error. Handle the error by dispatching fatal events to all qps - * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as - * L2 driver want to modify the MSIx table. - */ - bnxt_re_add_device(adev, BNXT_RE_POST_RECOVERY_INIT); rdev = en_info->rdev; ibdev_info(&rdev->ibdev, "Device resume completed");