From 91018f8632e09e3a617c9fc2efbbdaa2922d2fe7 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Sat, 25 Feb 2012 17:45:02 -0800 Subject: [PATCH 01/15] RDMA/cxgb4: Add missing peer2peer check in MPAv2 code Don't worry about p2p_type if peer2peer itself is not requested in the first place. Signed-off-by: Kumar Sanghvi Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 0668bb3472d0..006a35372b7a 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1114,7 +1114,7 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) * generated when moving QP to RTS state. * A TERM message will be sent after QP has moved to RTS state */ - if ((ep->mpa_attr.version == 2) && + if ((ep->mpa_attr.version == 2) && peer2peer && (ep->mpa_attr.p2p_type != p2p_type)) { ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; rtr_mismatch = 1; From 8dd87fba939370e729b0ee72c163f279d310de06 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Sat, 25 Feb 2012 17:45:37 -0800 Subject: [PATCH 02/15] RDMA/nes: Fixes for sparse endianness warnings Fix endianness problems detect by sparse, introduced with the enhanced MPA patch. Reported-by: Dan Carpenter Signed-off-by: Tatyana Nikolova Signed-off-by: Faisal Latif Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_cm.c | 39 +++++++++++++++++------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index a4972abedef1..da2c67db5ebb 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -338,18 +338,21 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, case IETF_MPA_V2: { u16 ird_size; u16 ord_size; + u16 rtr_ctrl_ird; + u16 rtr_ctrl_ord; + mpa_v2_frame = (struct ietf_mpa_v2 *)buffer; mpa_hdr_len += IETF_RTR_MSG_SIZE; cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE; rtr_msg = &mpa_v2_frame->rtr_msg; /* parse rtr message */ - rtr_msg->ctrl_ird = ntohs(rtr_msg->ctrl_ird); - rtr_msg->ctrl_ord = ntohs(rtr_msg->ctrl_ord); - ird_size = rtr_msg->ctrl_ird & IETF_NO_IRD_ORD; - ord_size = rtr_msg->ctrl_ord & IETF_NO_IRD_ORD; + rtr_ctrl_ird = ntohs(rtr_msg->ctrl_ird); + rtr_ctrl_ord = ntohs(rtr_msg->ctrl_ord); + ird_size = rtr_ctrl_ird & IETF_NO_IRD_ORD; + ord_size = rtr_ctrl_ord & IETF_NO_IRD_ORD; - if (!(rtr_msg->ctrl_ird & IETF_PEER_TO_PEER)) { + if (!(rtr_ctrl_ird & IETF_PEER_TO_PEER)) { /* send reset */ return -EINVAL; } @@ -370,9 +373,9 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, } } - if (rtr_msg->ctrl_ord & IETF_RDMA0_READ) { + if (rtr_ctrl_ord & IETF_RDMA0_READ) { cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; - } else if (rtr_msg->ctrl_ord & IETF_RDMA0_WRITE) { + } else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) { cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO; } else { /* Not supported RDMA0 operation */ return -EINVAL; @@ -543,6 +546,8 @@ static void build_mpa_v2(struct nes_cm_node *cm_node, { struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr; struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg; + u16 ctrl_ird; + u16 ctrl_ord; /* initialize the upper 5 bytes of the frame */ build_mpa_v1(cm_node, start_addr, mpa_key); @@ -550,31 +555,31 @@ static void build_mpa_v2(struct nes_cm_node *cm_node, mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE); /* initialize RTR msg */ - rtr_msg->ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? + ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? IETF_NO_IRD_ORD : cm_node->ird_size; - rtr_msg->ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? + ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? IETF_NO_IRD_ORD : cm_node->ord_size; - rtr_msg->ctrl_ird |= IETF_PEER_TO_PEER; - rtr_msg->ctrl_ird |= IETF_FLPDU_ZERO_LEN; + ctrl_ird |= IETF_PEER_TO_PEER; + ctrl_ird |= IETF_FLPDU_ZERO_LEN; switch (mpa_key) { case MPA_KEY_REQUEST: - rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE; - rtr_msg->ctrl_ord |= IETF_RDMA0_READ; + ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_READ; break; case MPA_KEY_REPLY: switch (cm_node->send_rdma0_op) { case SEND_RDMA_WRITE_ZERO: - rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_WRITE; break; case SEND_RDMA_READ_ZERO: - rtr_msg->ctrl_ord |= IETF_RDMA0_READ; + ctrl_ord |= IETF_RDMA0_READ; break; } } - rtr_msg->ctrl_ird = htons(rtr_msg->ctrl_ird); - rtr_msg->ctrl_ord = htons(rtr_msg->ctrl_ord); + rtr_msg->ctrl_ird = htons(ctrl_ird); + rtr_msg->ctrl_ord = htons(ctrl_ord); } /** From a778f3fddc6fc2ed4c065f6e160d517a5959f949 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 25 Feb 2012 17:45:49 -0800 Subject: [PATCH 03/15] IB/qib: Add logic for affinity hint Call irq_set_affinity_hint() to give userspace programs such as irqbalance the information to be able to distribute qib interrupts appropriately. The logic allocates all non-receive interrupts to the first CPU local to the HCA. Receive interrupts are allocated round robin starting with the second CPU local to the HCA with potential wrap back to the second CPU. This patch also adds a refinement to the name registered for MSI-X interrupts so that user level scripts can determine the device associated with the IRQs when there are multiple HCAs with a potentially different set of local CPUs. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib.h | 10 ++- drivers/infiniband/hw/qib/qib_iba7322.c | 107 +++++++++++++++++------- drivers/infiniband/hw/qib/qib_pcie.c | 21 ++++- 3 files changed, 104 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index b881bdc401f5..6b811e3e8bd1 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -427,6 +427,14 @@ struct qib_verbs_txreq { /* how often we check for packet activity for "power on hours (in seconds) */ #define ACTIVITY_TIMER 5 +#define MAX_NAME_SIZE 64 +struct qib_msix_entry { + struct msix_entry msix; + void *arg; + char name[MAX_NAME_SIZE]; + cpumask_var_t mask; +}; + /* Below is an opaque struct. Each chip (device) can maintain * private data needed for its operation, but not germane to the * rest of the driver. For convenience, we define another that @@ -1355,7 +1363,7 @@ int qib_pcie_init(struct pci_dev *, const struct pci_device_id *); int qib_pcie_ddinit(struct qib_devdata *, struct pci_dev *, const struct pci_device_id *); void qib_pcie_ddcleanup(struct qib_devdata *); -int qib_pcie_params(struct qib_devdata *, u32, u32 *, struct msix_entry *); +int qib_pcie_params(struct qib_devdata *, u32, u32 *, struct qib_msix_entry *); int qib_reinit_intr(struct qib_devdata *); void qib_enable_intx(struct pci_dev *); void qib_nomsi(struct qib_devdata *); diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 41e92089e41b..060b96064469 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -541,8 +541,7 @@ struct qib_chip_specific { u32 lastbuf_for_pio; u32 stay_in_freeze; u32 recovery_ports_initted; - struct msix_entry *msix_entries; - void **msix_arg; + struct qib_msix_entry *msix_entries; unsigned long *sendchkenable; unsigned long *sendgrhchk; unsigned long *sendibchk; @@ -639,24 +638,24 @@ static struct { int lsb; int port; /* 0 if not port-specific, else port # */ } irq_table[] = { - { QIB_DRV_NAME, qib_7322intr, -1, 0 }, - { QIB_DRV_NAME " (buf avail)", qib_7322bufavail, + { "", qib_7322intr, -1, 0 }, + { " (buf avail)", qib_7322bufavail, SYM_LSB(IntStatus, SendBufAvail), 0 }, - { QIB_DRV_NAME " (sdma 0)", sdma_intr, + { " (sdma 0)", sdma_intr, SYM_LSB(IntStatus, SDmaInt_0), 1 }, - { QIB_DRV_NAME " (sdma 1)", sdma_intr, + { " (sdma 1)", sdma_intr, SYM_LSB(IntStatus, SDmaInt_1), 2 }, - { QIB_DRV_NAME " (sdmaI 0)", sdma_idle_intr, + { " (sdmaI 0)", sdma_idle_intr, SYM_LSB(IntStatus, SDmaIdleInt_0), 1 }, - { QIB_DRV_NAME " (sdmaI 1)", sdma_idle_intr, + { " (sdmaI 1)", sdma_idle_intr, SYM_LSB(IntStatus, SDmaIdleInt_1), 2 }, - { QIB_DRV_NAME " (sdmaP 0)", sdma_progress_intr, + { " (sdmaP 0)", sdma_progress_intr, SYM_LSB(IntStatus, SDmaProgressInt_0), 1 }, - { QIB_DRV_NAME " (sdmaP 1)", sdma_progress_intr, + { " (sdmaP 1)", sdma_progress_intr, SYM_LSB(IntStatus, SDmaProgressInt_1), 2 }, - { QIB_DRV_NAME " (sdmaC 0)", sdma_cleanup_intr, + { " (sdmaC 0)", sdma_cleanup_intr, SYM_LSB(IntStatus, SDmaCleanupDone_0), 1 }, - { QIB_DRV_NAME " (sdmaC 1)", sdma_cleanup_intr, + { " (sdmaC 1)", sdma_cleanup_intr, SYM_LSB(IntStatus, SDmaCleanupDone_1), 2 }, }; @@ -2567,9 +2566,13 @@ static void qib_7322_nomsix(struct qib_devdata *dd) int i; dd->cspec->num_msix_entries = 0; - for (i = 0; i < n; i++) - free_irq(dd->cspec->msix_entries[i].vector, - dd->cspec->msix_arg[i]); + for (i = 0; i < n; i++) { + irq_set_affinity_hint( + dd->cspec->msix_entries[i].msix.vector, NULL); + free_cpumask_var(dd->cspec->msix_entries[i].mask); + free_irq(dd->cspec->msix_entries[i].msix.vector, + dd->cspec->msix_entries[i].arg); + } qib_nomsix(dd); } /* make sure no MSIx interrupts are left pending */ @@ -2597,7 +2600,6 @@ static void qib_setup_7322_cleanup(struct qib_devdata *dd) kfree(dd->cspec->sendgrhchk); kfree(dd->cspec->sendibchk); kfree(dd->cspec->msix_entries); - kfree(dd->cspec->msix_arg); for (i = 0; i < dd->num_pports; i++) { unsigned long flags; u32 mask = QSFP_GPIO_MOD_PRS_N | @@ -3070,6 +3072,8 @@ static void qib_setup_7322_interrupt(struct qib_devdata *dd, int clearpend) int ret, i, msixnum; u64 redirect[6]; u64 mask; + const struct cpumask *local_mask; + int firstcpu, secondcpu = 0, currrcvcpu = 0; if (!dd->num_pports) return; @@ -3118,13 +3122,28 @@ try_intx: memset(redirect, 0, sizeof redirect); mask = ~0ULL; msixnum = 0; + local_mask = cpumask_of_pcibus(dd->pcidev->bus); + firstcpu = cpumask_first(local_mask); + if (firstcpu >= nr_cpu_ids || + cpumask_weight(local_mask) == num_online_cpus()) { + local_mask = topology_core_cpumask(0); + firstcpu = cpumask_first(local_mask); + } + if (firstcpu < nr_cpu_ids) { + secondcpu = cpumask_next(firstcpu, local_mask); + if (secondcpu >= nr_cpu_ids) + secondcpu = firstcpu; + currrcvcpu = secondcpu; + } for (i = 0; msixnum < dd->cspec->num_msix_entries; i++) { irq_handler_t handler; - const char *name; void *arg; u64 val; int lsb, reg, sh; + dd->cspec->msix_entries[msixnum]. + name[sizeof(dd->cspec->msix_entries[msixnum].name) - 1] + = '\0'; if (i < ARRAY_SIZE(irq_table)) { if (irq_table[i].port) { /* skip if for a non-configured port */ @@ -3135,7 +3154,11 @@ try_intx: arg = dd; lsb = irq_table[i].lsb; handler = irq_table[i].handler; - name = irq_table[i].name; + snprintf(dd->cspec->msix_entries[msixnum].name, + sizeof(dd->cspec->msix_entries[msixnum].name) + - 1, + QIB_DRV_NAME "%d%s", dd->unit, + irq_table[i].name); } else { unsigned ctxt; @@ -3148,23 +3171,28 @@ try_intx: continue; lsb = QIB_I_RCVAVAIL_LSB + ctxt; handler = qib_7322pintr; - name = QIB_DRV_NAME " (kctx)"; + snprintf(dd->cspec->msix_entries[msixnum].name, + sizeof(dd->cspec->msix_entries[msixnum].name) + - 1, + QIB_DRV_NAME "%d (kctx)", dd->unit); } - ret = request_irq(dd->cspec->msix_entries[msixnum].vector, - handler, 0, name, arg); + ret = request_irq( + dd->cspec->msix_entries[msixnum].msix.vector, + handler, 0, dd->cspec->msix_entries[msixnum].name, + arg); if (ret) { /* * Shouldn't happen since the enable said we could * have as many as we are trying to setup here. */ qib_dev_err(dd, "Couldn't setup MSIx " - "interrupt (vec=%d, irq=%d): %d\n", msixnum, - dd->cspec->msix_entries[msixnum].vector, - ret); + "interrupt (vec=%d, irq=%d): %d\n", msixnum, + dd->cspec->msix_entries[msixnum].msix.vector, + ret); qib_7322_nomsix(dd); goto try_intx; } - dd->cspec->msix_arg[msixnum] = arg; + dd->cspec->msix_entries[msixnum].arg = arg; if (lsb >= 0) { reg = lsb / IBA7322_REDIRECT_VEC_PER_REG; sh = (lsb % IBA7322_REDIRECT_VEC_PER_REG) * @@ -3174,6 +3202,25 @@ try_intx: } val = qib_read_kreg64(dd, 2 * msixnum + 1 + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + if (firstcpu < nr_cpu_ids && + zalloc_cpumask_var( + &dd->cspec->msix_entries[msixnum].mask, + GFP_KERNEL)) { + if (handler == qib_7322pintr) { + cpumask_set_cpu(currrcvcpu, + dd->cspec->msix_entries[msixnum].mask); + currrcvcpu = cpumask_next(currrcvcpu, + local_mask); + if (currrcvcpu >= nr_cpu_ids) + currrcvcpu = secondcpu; + } else { + cpumask_set_cpu(firstcpu, + dd->cspec->msix_entries[msixnum].mask); + } + irq_set_affinity_hint( + dd->cspec->msix_entries[msixnum].msix.vector, + dd->cspec->msix_entries[msixnum].mask); + } msixnum++; } /* Initialize the vector mapping */ @@ -3365,7 +3412,7 @@ static int qib_do_7322_reset(struct qib_devdata *dd) if (msix_entries) { /* restore the MSIx vector address and data if saved above */ for (i = 0; i < msix_entries; i++) { - dd->cspec->msix_entries[i].entry = i; + dd->cspec->msix_entries[i].msix.entry = i; if (!msix_vecsave || !msix_vecsave[2 * i]) continue; qib_write_kreg(dd, 2 * i + @@ -6865,15 +6912,13 @@ struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev, tabsize = actual_cnt; dd->cspec->msix_entries = kmalloc(tabsize * - sizeof(struct msix_entry), GFP_KERNEL); - dd->cspec->msix_arg = kmalloc(tabsize * - sizeof(void *), GFP_KERNEL); - if (!dd->cspec->msix_entries || !dd->cspec->msix_arg) { + sizeof(struct qib_msix_entry), GFP_KERNEL); + if (!dd->cspec->msix_entries) { qib_dev_err(dd, "No memory for MSIx table\n"); tabsize = 0; } for (i = 0; i < tabsize; i++) - dd->cspec->msix_entries[i].entry = i; + dd->cspec->msix_entries[i].msix.entry = i; if (qib_pcie_params(dd, 8, &tabsize, dd->cspec->msix_entries)) qib_dev_err(dd, "Failed to setup PCIe or interrupts; " diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 0fde788e1100..790646ef5106 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -194,11 +194,24 @@ void qib_pcie_ddcleanup(struct qib_devdata *dd) } static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt, - struct msix_entry *msix_entry) + struct qib_msix_entry *qib_msix_entry) { int ret; u32 tabsize = 0; u16 msix_flags; + struct msix_entry *msix_entry; + int i; + + /* We can't pass qib_msix_entry array to qib_msix_setup + * so use a dummy msix_entry array and copy the allocated + * irq back to the qib_msix_entry array. */ + msix_entry = kmalloc(*msixcnt * sizeof(*msix_entry), GFP_KERNEL); + if (!msix_entry) { + ret = -ENOMEM; + goto do_intx; + } + for (i = 0; i < *msixcnt; i++) + msix_entry[i] = qib_msix_entry[i].msix; pci_read_config_word(dd->pcidev, pos + PCI_MSIX_FLAGS, &msix_flags); tabsize = 1 + (msix_flags & PCI_MSIX_FLAGS_QSIZE); @@ -209,11 +222,15 @@ static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt, tabsize = ret; ret = pci_enable_msix(dd->pcidev, msix_entry, tabsize); } +do_intx: if (ret) { qib_dev_err(dd, "pci_enable_msix %d vectors failed: %d, " "falling back to INTx\n", tabsize, ret); tabsize = 0; } + for (i = 0; i < tabsize; i++) + qib_msix_entry[i].msix = msix_entry[i]; + kfree(msix_entry); *msixcnt = tabsize; if (ret) @@ -251,7 +268,7 @@ static int qib_msi_setup(struct qib_devdata *dd, int pos) } int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent, - struct msix_entry *entry) + struct qib_msix_entry *entry) { u16 linkstat, speed; int pos = 0, pose, ret = 1; From 520b3ee70527cb47f0b08ceb25ace02aed71eab7 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Sat, 25 Feb 2012 17:45:50 -0800 Subject: [PATCH 04/15] IB/qib: Avoid filtering LID on SMA portinfo The current get portinfo handling filters the LID being sent, changing zero to 0xffff. This causes OpenSM to log excessive warning messages. Reviewed-by: Edward Mascarenhas Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_mad.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 3b3745f261f0..c4ff788823b5 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -433,7 +433,6 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, struct qib_pportdata *ppd; struct qib_ibport *ibp; struct ib_port_info *pip = (struct ib_port_info *)smp->data; - u16 lid; u8 mtu; int ret; u32 state; @@ -469,8 +468,7 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, ibp->mkeyprot == 1)) pip->mkey = ibp->mkey; pip->gid_prefix = ibp->gid_prefix; - lid = ppd->lid; - pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE; + pip->lid = cpu_to_be16(ppd->lid); pip->sm_lid = cpu_to_be16(ibp->sm_lid); pip->cap_mask = cpu_to_be32(ibp->port_cap_flags); /* pip->diag_code; */ From 6aeaa48b0dd3a4261804be1cccaea46d82be3fcb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 25 Feb 2012 17:47:21 -0800 Subject: [PATCH 05/15] IB/ehca: Use kthread_create_on_node() Since create_comp_task() creates percpu kthread, it makes sense to use kthread_create_on_node() to get proper NUMA affinity for kthread stack. Signed-off-by: Eric Dumazet Acked-by: Hoang-Nam Nguyen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_irq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index e571e60ecb88..53589000fd07 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -786,7 +786,8 @@ static struct task_struct *create_comp_task(struct ehca_comp_pool *pool, spin_lock_init(&cct->task_lock); INIT_LIST_HEAD(&cct->cq_list); init_waitqueue_head(&cct->wait_queue); - cct->task = kthread_create(comp_task, cct, "ehca_comp/%d", cpu); + cct->task = kthread_create_on_node(comp_task, cct, cpu_to_node(cpu), + "ehca_comp/%d", cpu); return cct->task; } From d144b650c635b941c3d73ef995ec16984594157f Mon Sep 17 00:00:00 2001 From: Swapna Thete Date: Sat, 25 Feb 2012 17:47:31 -0800 Subject: [PATCH 06/15] IB/mad: Add MAD error codes from IBA spec Add defines for MAD error codes so that they can be used when returning error responses. Signed-off-by: Swapna Thete Signed-off-by: Roland Dreier --- include/rdma/ib_mad.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index d3b9401b77b0..b513f57e1725 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -77,6 +77,15 @@ #define IB_MGMT_MAX_METHODS 128 +/* MAD Status field bit masks */ +#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000 +#define IB_MGMT_MAD_STATUS_BUSY 0x0001 +#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002 +#define IB_MGMT_MAD_STATUS_BAD_VERSION 0x0004 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c +#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c + /* RMPP information */ #define IB_MGMT_RMPP_VERSION 1 From 0b307043049f34211affdde46f82e7abbe8c4590 Mon Sep 17 00:00:00 2001 From: Swapna Thete Date: Sat, 25 Feb 2012 17:47:32 -0800 Subject: [PATCH 07/15] IB/mad: Return error response for unsupported MADs Set up a response with appropriate error status and send it for MADs that are not supported by a specific class/version. Reviewed-by: Hal Rosenstock Signed-off-by: Swapna Thete Signed-off-by: Roland Dreier --- drivers/infiniband/core/mad.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 2fe428bba54c..426bb7617ec6 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1842,6 +1842,24 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, } } +static bool generate_unmatched_resp(struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET || + recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) { + memcpy(response, recv, sizeof *response); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = &response->mad.mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + response->mad.mad.mad_hdr.status = + cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); + + return true; + } else { + return false; + } +} static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, struct ib_wc *wc) { @@ -1963,6 +1981,9 @@ local: * or via recv_handler in ib_mad_complete_recv() */ recv = NULL; + } else if (generate_unmatched_resp(recv, response)) { + agent_send_response(&response->mad.mad, &recv->grh, wc, + port_priv->device, port_num, qp_info->qp->qp_num); } out: From a776ce7cfc905a1f8ebf9f7e87f0ba705e7efaef Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Thu, 9 Feb 2012 23:37:43 +0900 Subject: [PATCH 08/15] IB/srpt: Fix typo "alocate" -> "allocate" Signed-off-by: Masanari Iida Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srpt/ib_srpt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 2b73d43cd691..ebe33d960d77 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -3450,7 +3450,7 @@ static struct se_node_acl *srpt_alloc_fabric_acl(struct se_portal_group *se_tpg) nacl = kzalloc(sizeof(struct srpt_node_acl), GFP_KERNEL); if (!nacl) { - printk(KERN_ERR "Unable to alocate struct srpt_node_acl\n"); + printk(KERN_ERR "Unable to allocate struct srpt_node_acl\n"); return NULL; } From e0bda7d8c33e60fb08cc0b5522cd86346313722c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 14 Jan 2012 12:39:44 +0000 Subject: [PATCH 09/15] IB/srp: Use pr_fmt() and pr_err()/pr_warn() Use pr_fmt() and pr_xxx() instead of more verbose printk() equivalents. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 54 ++++++++++++++++------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 0bfa545675b8..895a9ff17379 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -30,6 +30,8 @@ * SOFTWARE. */ +#define pr_fmt(fmt) PFX fmt + #include #include #include @@ -165,7 +167,7 @@ static void srp_free_iu(struct srp_host *host, struct srp_iu *iu) static void srp_qp_event(struct ib_event *event, void *context) { - printk(KERN_ERR PFX "QP event %d\n", event->event); + pr_debug("QP event %d\n", event->event); } static int srp_init_qp(struct srp_target_port *target, @@ -1989,7 +1991,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) goto out; } if (strlen(p) != 32) { - printk(KERN_WARNING PFX "bad dest GID parameter '%s'\n", p); + pr_warn("bad dest GID parameter '%s'\n", p); kfree(p); goto out; } @@ -2004,7 +2006,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) case SRP_OPT_PKEY: if (match_hex(args, &token)) { - printk(KERN_WARNING PFX "bad P_Key parameter '%s'\n", p); + pr_warn("bad P_Key parameter '%s'\n", p); goto out; } target->path.pkey = cpu_to_be16(token); @@ -2023,7 +2025,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) case SRP_OPT_MAX_SECT: if (match_int(args, &token)) { - printk(KERN_WARNING PFX "bad max sect parameter '%s'\n", p); + pr_warn("bad max sect parameter '%s'\n", p); goto out; } target->scsi_host->max_sectors = token; @@ -2031,7 +2033,8 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) case SRP_OPT_MAX_CMD_PER_LUN: if (match_int(args, &token)) { - printk(KERN_WARNING PFX "bad max cmd_per_lun parameter '%s'\n", p); + pr_warn("bad max cmd_per_lun parameter '%s'\n", + p); goto out; } target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE); @@ -2039,14 +2042,14 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) case SRP_OPT_IO_CLASS: if (match_hex(args, &token)) { - printk(KERN_WARNING PFX "bad IO class parameter '%s' \n", p); + pr_warn("bad IO class parameter '%s'\n", p); goto out; } if (token != SRP_REV10_IB_IO_CLASS && token != SRP_REV16A_IB_IO_CLASS) { - printk(KERN_WARNING PFX "unknown IO class parameter value" - " %x specified (use %x or %x).\n", - token, SRP_REV10_IB_IO_CLASS, SRP_REV16A_IB_IO_CLASS); + pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n", + token, SRP_REV10_IB_IO_CLASS, + SRP_REV16A_IB_IO_CLASS); goto out; } target->io_class = token; @@ -2064,7 +2067,8 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) case SRP_OPT_CMD_SG_ENTRIES: if (match_int(args, &token) || token < 1 || token > 255) { - printk(KERN_WARNING PFX "bad max cmd_sg_entries parameter '%s'\n", p); + pr_warn("bad max cmd_sg_entries parameter '%s'\n", + p); goto out; } target->cmd_sg_cnt = token; @@ -2072,7 +2076,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) case SRP_OPT_ALLOW_EXT_SG: if (match_int(args, &token)) { - printk(KERN_WARNING PFX "bad allow_ext_sg parameter '%s'\n", p); + pr_warn("bad allow_ext_sg parameter '%s'\n", p); goto out; } target->allow_ext_sg = !!token; @@ -2081,15 +2085,16 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) case SRP_OPT_SG_TABLESIZE: if (match_int(args, &token) || token < 1 || token > SCSI_MAX_SG_CHAIN_SEGMENTS) { - printk(KERN_WARNING PFX "bad max sg_tablesize parameter '%s'\n", p); + pr_warn("bad max sg_tablesize parameter '%s'\n", + p); goto out; } target->sg_tablesize = token; break; default: - printk(KERN_WARNING PFX "unknown parameter or missing value " - "'%s' in target creation request\n", p); + pr_warn("unknown parameter or missing value '%s' in target creation request\n", + p); goto out; } } @@ -2100,9 +2105,8 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) for (i = 0; i < ARRAY_SIZE(srp_opt_tokens); ++i) if ((srp_opt_tokens[i].token & SRP_OPT_ALL) && !(srp_opt_tokens[i].token & opt_mask)) - printk(KERN_WARNING PFX "target creation request is " - "missing parameter '%s'\n", - srp_opt_tokens[i].pattern); + pr_warn("target creation request is missing parameter '%s'\n", + srp_opt_tokens[i].pattern); out: kfree(options); @@ -2149,7 +2153,7 @@ static ssize_t srp_create_target(struct device *dev, if (!host->srp_dev->fmr_pool && !target->allow_ext_sg && target->cmd_sg_cnt < target->sg_tablesize) { - printk(KERN_WARNING PFX "No FMR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); + pr_warn("No FMR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); target->sg_tablesize = target->cmd_sg_cnt; } @@ -2309,8 +2313,7 @@ static void srp_add_one(struct ib_device *device) return; if (ib_query_device(device, dev_attr)) { - printk(KERN_WARNING PFX "Query device failed for %s\n", - device->name); + pr_warn("Query device failed for %s\n", device->name); goto free_attr; } @@ -2459,7 +2462,7 @@ static int __init srp_init_module(void) BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *)); if (srp_sg_tablesize) { - printk(KERN_WARNING PFX "srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); + pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); if (!cmd_sg_entries) cmd_sg_entries = srp_sg_tablesize; } @@ -2468,14 +2471,15 @@ static int __init srp_init_module(void) cmd_sg_entries = SRP_DEF_SG_TABLESIZE; if (cmd_sg_entries > 255) { - printk(KERN_WARNING PFX "Clamping cmd_sg_entries to 255\n"); + pr_warn("Clamping cmd_sg_entries to 255\n"); cmd_sg_entries = 255; } if (!indirect_sg_entries) indirect_sg_entries = cmd_sg_entries; else if (indirect_sg_entries < cmd_sg_entries) { - printk(KERN_WARNING PFX "Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", cmd_sg_entries); + pr_warn("Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", + cmd_sg_entries); indirect_sg_entries = cmd_sg_entries; } @@ -2486,7 +2490,7 @@ static int __init srp_init_module(void) ret = class_register(&srp_class); if (ret) { - printk(KERN_ERR PFX "couldn't register class infiniband_srp\n"); + pr_err("couldn't register class infiniband_srp\n"); srp_release_transport(ib_srp_transport_template); return ret; } @@ -2495,7 +2499,7 @@ static int __init srp_init_module(void) ret = ib_register_client(&srp_client); if (ret) { - printk(KERN_ERR PFX "couldn't register IB client\n"); + pr_err("couldn't register IB client\n"); srp_release_transport(ib_srp_transport_template); ib_sa_unregister_client(&srp_sa_client); class_unregister(&srp_class); From 683b159a2eef6544d49020eb9c88a94157df7d2d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 14 Jan 2012 12:40:44 +0000 Subject: [PATCH 10/15] IB/srp: Consolidate repetitive sysfs code Remove sysfs attributes before removing a target instead of testing the target state in every sysfs attribute callback method. Note: it is safe to invoke a sysfs attribute removal method like device_remove_file() twice on the same attribute. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 49 ++++++++++------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 895a9ff17379..bcbf22ee0aa7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -474,6 +474,21 @@ static void srp_free_req_data(struct srp_target_port *target) } } +/** + * srp_del_scsi_host_attr() - Remove attributes defined in the host template. + * @shost: SCSI host whose attributes to remove from sysfs. + * + * Note: Any attributes defined in the host template and that did not exist + * before invocation of this function will be ignored. + */ +static void srp_del_scsi_host_attr(struct Scsi_Host *shost) +{ + struct device_attribute **attr; + + for (attr = shost->hostt->shost_attrs; attr && *attr; ++attr) + device_remove_file(&shost->shost_dev, *attr); +} + static void srp_remove_work(struct work_struct *work) { struct srp_target_port *target = @@ -486,6 +501,7 @@ static void srp_remove_work(struct work_struct *work) list_del(&target->list); spin_unlock(&target->srp_host->target_lock); + srp_del_scsi_host_attr(target->scsi_host); srp_remove_host(target->scsi_host); scsi_remove_host(target->scsi_host); ib_destroy_cm_id(target->cm_id); @@ -1678,10 +1694,6 @@ static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -ENODEV; - return sprintf(buf, "0x%016llx\n", (unsigned long long) be64_to_cpu(target->id_ext)); } @@ -1691,10 +1703,6 @@ static ssize_t show_ioc_guid(struct device *dev, struct device_attribute *attr, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -ENODEV; - return sprintf(buf, "0x%016llx\n", (unsigned long long) be64_to_cpu(target->ioc_guid)); } @@ -1704,10 +1712,6 @@ static ssize_t show_service_id(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -ENODEV; - return sprintf(buf, "0x%016llx\n", (unsigned long long) be64_to_cpu(target->service_id)); } @@ -1717,10 +1721,6 @@ static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -ENODEV; - return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey)); } @@ -1729,10 +1729,6 @@ static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -ENODEV; - return sprintf(buf, "%pI6\n", target->path.dgid.raw); } @@ -1741,10 +1737,6 @@ static ssize_t show_orig_dgid(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -ENODEV; - return sprintf(buf, "%pI6\n", target->orig_dgid); } @@ -1753,10 +1745,6 @@ static ssize_t show_req_lim(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -ENODEV; - return sprintf(buf, "%d\n", target->req_lim); } @@ -1765,10 +1753,6 @@ static ssize_t show_zero_req_lim(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -ENODEV; - return sprintf(buf, "%d\n", target->zero_req_lim); } @@ -2432,6 +2416,7 @@ static void srp_remove_one(struct ib_device *device) list_for_each_entry_safe(target, tmp_target, &host->target_list, list) { + srp_del_scsi_host_attr(target->scsi_host); srp_remove_host(target->scsi_host); scsi_remove_host(target->scsi_host); srp_disconnect_target(target); From d474186f19d7ac1c7fbb293fdcfa46103e45e2ca Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 1 Mar 2012 19:55:21 +0200 Subject: [PATCH 11/15] IB/iser: Free IB connection resources in the proper place We allocate the login dma buffers in iser_verbs.c as part of alloc_ib_conn_resources(), however we are freeing them in iser_initiator.c as part of iser_free_rx_descriptors(). This is needlessly confusing. We have an alloc_rx_descriptors() and it doesn't alloc something that the free_rx_descriptors() frees, and we have an alloc_ib_conn_resources() that allocs something not freed by free_ib_conn_resources(). Clean that up. Signed-off-by: Doug Ledford [ Fix build error in iser_free_ib_conn_res(). - Or ] Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_initiator.c | 12 ------------ drivers/infiniband/ulp/iser/iser_verbs.c | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index a607542fc796..622e9857c869 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -220,18 +220,6 @@ void iser_free_rx_descriptors(struct iser_conn *ib_conn) struct iser_rx_desc *rx_desc; struct iser_device *device = ib_conn->device; - if (ib_conn->login_buf) { - if (ib_conn->login_req_dma) - ib_dma_unmap_single(device->ib_device, - ib_conn->login_req_dma, - ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); - if (ib_conn->login_resp_dma) - ib_dma_unmap_single(device->ib_device, - ib_conn->login_resp_dma, - ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); - kfree(ib_conn->login_buf); - } - if (!ib_conn->rx_descs) return; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index e28877c4ce15..14224ba44fd8 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -274,6 +274,18 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id) ib_conn->cma_id = NULL; kfree(ib_conn->page_vec); + if (ib_conn->login_buf) { + if (ib_conn->login_req_dma) + ib_dma_unmap_single(ib_conn->device->ib_device, + ib_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + if (ib_conn->login_resp_dma) + ib_dma_unmap_single(ib_conn->device->ib_device, + ib_conn->login_resp_dma, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + kfree(ib_conn->login_buf); + } + return 0; } From 89e984e2c2cd14f77ccb26c47726ac7f13b70ae8 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 5 Mar 2012 18:21:44 +0200 Subject: [PATCH 12/15] IB/iser: Post initial receive buffers before sending the final login request An iser target may send iscsi NO-OP PDUs as soon as it marks the iSER iSCSI session as fully operative. This means that there is window where there are no posted receive buffers on the initiator side, so it's possible for the iSER RC connection to break because of RNR NAK / retry errors. To fix this, rely on the flags bits in the login request to have FFP (0x3) in the lower nibble as a marker for the final login request, and post an initial chunk of receive buffers before sending that login request instead of after getting the login response. Signed-off-by: Or Gerlitz Cc: Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.c | 18 +++-------- drivers/infiniband/ulp/iser/iscsi_iser.h | 1 + drivers/infiniband/ulp/iser/iser_initiator.c | 32 +++++++++++--------- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 9a43cb07f294..db43b3117168 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -364,6 +364,9 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, } ib_conn = ep->dd_data; + if (iser_alloc_rx_descriptors(ib_conn)) + return -ENOMEM; + /* binds the iSER connection retrieved from the previously * connected ep_handle to the iSCSI layer connection. exchanges * connection pointers */ @@ -398,19 +401,6 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) iser_conn->ib_conn = NULL; } -static int -iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) -{ - struct iscsi_conn *conn = cls_conn->dd_data; - int err; - - err = iser_conn_set_full_featured_mode(conn); - if (err) - return err; - - return iscsi_conn_start(cls_conn); -} - static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) { struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); @@ -724,7 +714,7 @@ static struct iscsi_transport iscsi_iser_transport = { .get_conn_param = iscsi_conn_get_param, .get_ep_param = iscsi_iser_get_ep_param, .get_session_param = iscsi_session_get_param, - .start_conn = iscsi_iser_conn_start, + .start_conn = iscsi_conn_start, .stop_conn = iscsi_iser_conn_stop, /* iscsi host params */ .get_host_param = iscsi_host_get_param, diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index db7ea3704da7..296be431a0e9 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -366,4 +366,5 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task); int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc); +int iser_alloc_rx_descriptors(struct iser_conn *ib_conn); #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 622e9857c869..a00ccd1ca333 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -170,7 +170,7 @@ static void iser_create_send_desc(struct iser_conn *ib_conn, } -static int iser_alloc_rx_descriptors(struct iser_conn *ib_conn) +int iser_alloc_rx_descriptors(struct iser_conn *ib_conn) { int i, j; u64 dma_addr; @@ -230,23 +230,24 @@ void iser_free_rx_descriptors(struct iser_conn *ib_conn) kfree(ib_conn->rx_descs); } -/** - * iser_conn_set_full_featured_mode - (iSER API) - */ -int iser_conn_set_full_featured_mode(struct iscsi_conn *conn) +static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) { struct iscsi_iser_conn *iser_conn = conn->dd_data; + iser_dbg("req op %x flags %x\n", req->opcode, req->flags); + /* check if this is the last login - going to full feature phase */ + if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) + return 0; + + /* + * Check that there is one posted recv buffer (for the last login + * response) and no posted send buffers left - they must have been + * consumed during previous login phases. + */ + WARN_ON(iser_conn->ib_conn->post_recv_buf_count != 1); + WARN_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0); + iser_dbg("Initially post: %d\n", ISER_MIN_POSTED_RX); - - /* Check that there is no posted recv or send buffers left - */ - /* they must be consumed during the login phase */ - BUG_ON(iser_conn->ib_conn->post_recv_buf_count != 0); - BUG_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0); - - if (iser_alloc_rx_descriptors(iser_conn->ib_conn)) - return -ENOMEM; - /* Initial post receive buffers */ if (iser_post_recvm(iser_conn->ib_conn, ISER_MIN_POSTED_RX)) return -ENOMEM; @@ -426,6 +427,9 @@ int iser_send_control(struct iscsi_conn *conn, err = iser_post_recvl(iser_conn->ib_conn); if (err) goto send_control_error; + err = iser_post_rx_bufs(conn, task->hdr); + if (err) + goto send_control_error; } err = iser_post_send(iser_conn->ib_conn, mdesc); From bd50f8924c684f84416fb58c11eb24619b041f25 Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Mon, 27 Feb 2012 17:02:56 -0500 Subject: [PATCH 13/15] IB/ehca: Fix ilog2() compile failure I'm getting compile failures building this driver, which I narrowed down to the ilog2 call in ehca_get_max_hwpage_size... ERROR: ".____ilog2_NaN" [drivers/infiniband/hw/ehca/ib_ehca.ko] undefined! make[1]: *** [__modpost] Error 1 make: *** [modules] Error 2 The use of shca->hca_cap_mr_pgsize is confusing the compiler, and resulting in the __builtin_constant_p in ilog2 going insane. I tried making it take the u32 pgsize as an argument and the expansion of shca->_pgsize in the caller, but that failed as well. With this patch in place, the driver compiles on my GCC 4.6.2 here. Suggested-by: Roland Dreier Signed-off-by: Kyle McMartin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_mrmw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index 43cae84005f0..b781b2cb0624 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -112,7 +112,7 @@ static u32 ehca_encode_hwpage_size(u32 pgsize) static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca) { - return 1UL << ilog2(shca->hca_cap_mr_pgsize); + return rounddown_pow_of_two(shca->hca_cap_mr_pgsize); } static struct ehca_mr *ehca_mr_new(void) From 186834b5de69a89ba6cc846e7259451ced689b64 Mon Sep 17 00:00:00 2001 From: "Hefty, Sean" Date: Fri, 2 Mar 2012 00:01:19 +0000 Subject: [PATCH 14/15] RDMA/ucma: Fix AB-BA deadlock When we destroy a cm_id, we must purge associated events from the event queue. If the cm_id is for a listen request, we also purge corresponding pending connect requests. This requires destroying the cm_id's associated with the connect requests by calling rdma_destroy_id(). rdma_destroy_id() blocks until all outstanding callbacks have completed. The issue is that we hold file->mut while purging events from the event queue. We also acquire file->mut in our event handler. Calling rdma_destroy_id() while holding file->mut can lead to a deadlock, since the event handler callback cannot acquire file->mut, which prevents rdma_destroy_id() from completing. Fix this by moving events to purge from the event queue to a temporary list. We can then release file->mut and call rdma_destroy_id() outside of holding any locks. Bug report by Or Gerlitz : [ INFO: possible circular locking dependency detected ] 3.3.0-rc5-00008-g79f1e43-dirty #34 Tainted: G I tgtd/9018 is trying to acquire lock: (&id_priv->handler_mutex){+.+.+.}, at: [] rdma_destroy_id+0x33/0x1f0 [rdma_cm] but task is already holding lock: (&file->mut){+.+.+.}, at: [] ucma_free_ctx+0xb6/0x196 [rdma_ucm] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&file->mut){+.+.+.}: [] lock_acquire+0xf0/0x116 [] mutex_lock_nested+0x64/0x2e6 [] ucma_event_handler+0x148/0x1dc [rdma_ucm] [] cma_ib_handler+0x1a7/0x1f7 [rdma_cm] [] cm_process_work+0x32/0x119 [ib_cm] [] cm_work_handler+0xfb8/0xfe5 [ib_cm] [] process_one_work+0x2bd/0x4a6 [] worker_thread+0x1d6/0x350 [] kthread+0x84/0x8c [] kernel_thread_helper+0x4/0x10 -> #0 (&id_priv->handler_mutex){+.+.+.}: [] __lock_acquire+0x10d5/0x1752 [] lock_acquire+0xf0/0x116 [] mutex_lock_nested+0x64/0x2e6 [] rdma_destroy_id+0x33/0x1f0 [rdma_cm] [] ucma_free_ctx+0x117/0x196 [rdma_ucm] [] ucma_close+0x77/0xb4 [rdma_ucm] [] fput+0x117/0x1cf [] filp_close+0x6d/0x78 [] put_files_struct+0xbd/0x17d [] exit_files+0x46/0x4e [] do_exit+0x299/0x75d [] do_group_exit+0x7e/0xa9 [] get_signal_to_deliver+0x536/0x555 [] do_signal+0x39/0x634 [] do_notify_resume+0x27/0x69 [] retint_signal+0x46/0x83 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&file->mut); lock(&id_priv->handler_mutex); lock(&file->mut); lock(&id_priv->handler_mutex); *** DEADLOCK *** 1 lock held by tgtd/9018: #0: (&file->mut){+.+.+.}, at: [] ucma_free_ctx+0xb6/0x196 [rdma_ucm] stack backtrace: Pid: 9018, comm: tgtd Tainted: G I 3.3.0-rc5-00008-g79f1e43-dirty #34 Call Trace: [] ? console_unlock+0x18e/0x207 [] print_circular_bug+0x28e/0x29f [] __lock_acquire+0x10d5/0x1752 [] lock_acquire+0xf0/0x116 [] ? rdma_destroy_id+0x33/0x1f0 [rdma_cm] [] mutex_lock_nested+0x64/0x2e6 [] ? rdma_destroy_id+0x33/0x1f0 [rdma_cm] [] ? trace_hardirqs_on_caller+0x11e/0x155 [] ? trace_hardirqs_on+0xd/0xf [] rdma_destroy_id+0x33/0x1f0 [rdma_cm] [] ucma_free_ctx+0x117/0x196 [rdma_ucm] [] ucma_close+0x77/0xb4 [rdma_ucm] [] fput+0x117/0x1cf [] filp_close+0x6d/0x78 [] put_files_struct+0xbd/0x17d [] ? put_files_struct+0x22/0x17d [] exit_files+0x46/0x4e [] do_exit+0x299/0x75d [] do_group_exit+0x7e/0xa9 [] get_signal_to_deliver+0x536/0x555 [] ? trace_hardirqs_on+0xd/0xf [] do_signal+0x39/0x634 [] ? printk+0x3c/0x45 [] ? trace_hardirqs_on_caller+0x11e/0x155 [] ? trace_hardirqs_on+0xd/0xf [] ? _raw_spin_unlock_irq+0x2b/0x40 [] ? set_current_blocked+0x44/0x49 [] ? retint_signal+0x11/0x83 [] do_notify_resume+0x27/0x69 [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] retint_signal+0x46/0x83 Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/ucma.c | 37 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 5034a87cc72d..5861cdb22b7c 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -449,24 +449,6 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx) mutex_unlock(&mut); } -static void ucma_cleanup_events(struct ucma_context *ctx) -{ - struct ucma_event *uevent, *tmp; - - list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { - if (uevent->ctx != ctx) - continue; - - list_del(&uevent->list); - - /* clear incoming connections. */ - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) - rdma_destroy_id(uevent->cm_id); - - kfree(uevent); - } -} - static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; @@ -480,9 +462,16 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc) } } +/* + * We cannot hold file->mut when calling rdma_destroy_id() or we can + * deadlock. We also acquire file->mut in ucma_event_handler(), and + * rdma_destroy_id() will wait until all callbacks have completed. + */ static int ucma_free_ctx(struct ucma_context *ctx) { int events_reported; + struct ucma_event *uevent, *tmp; + LIST_HEAD(list); /* No new events will be generated after destroying the id. */ rdma_destroy_id(ctx->cm_id); @@ -491,10 +480,20 @@ static int ucma_free_ctx(struct ucma_context *ctx) /* Cleanup events not yet reported to the user. */ mutex_lock(&ctx->file->mut); - ucma_cleanup_events(ctx); + list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &list); + } list_del(&ctx->list); mutex_unlock(&ctx->file->mut); + list_for_each_entry_safe(uevent, tmp, &list, list) { + list_del(&uevent->list); + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) + rdma_destroy_id(uevent->cm_id); + kfree(uevent); + } + events_reported = ctx->events_reported; kfree(ctx); return events_reported; From db4106ce635830201fad1bfca731a635beab6a72 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 7 Mar 2012 16:48:46 -0600 Subject: [PATCH 15/15] RDMA/cxgb3: Don't pass irq flags to flush_qp() Since flush_qp() is always called with irqs disabled, all the locking inside flush_qp() and __flush_qp() doesn't need irq save/restore. Further, passing the flag variable from iwch_modify_qp() is just wrong and causes a WARN_ON() in local_bh_enable(). Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/iwch_qp.c | 40 +++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index bea5839d89ee..6de8463f453b 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -803,7 +803,7 @@ int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) * Assumes qhp lock is held. */ static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp, - struct iwch_cq *schp, unsigned long *flag) + struct iwch_cq *schp) { int count; int flushed; @@ -812,44 +812,44 @@ static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp, PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); /* take a ref on the qhp since we must release the lock */ atomic_inc(&qhp->refcnt); - spin_unlock_irqrestore(&qhp->lock, *flag); + spin_unlock(&qhp->lock); /* locking hierarchy: cq lock first, then qp lock. */ - spin_lock_irqsave(&rchp->lock, *flag); + spin_lock(&rchp->lock); spin_lock(&qhp->lock); cxio_flush_hw_cq(&rchp->cq); cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); - spin_unlock_irqrestore(&rchp->lock, *flag); + spin_unlock(&rchp->lock); if (flushed) { - spin_lock_irqsave(&rchp->comp_handler_lock, *flag); + spin_lock(&rchp->comp_handler_lock); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); - spin_unlock_irqrestore(&rchp->comp_handler_lock, *flag); + spin_unlock(&rchp->comp_handler_lock); } /* locking hierarchy: cq lock first, then qp lock. */ - spin_lock_irqsave(&schp->lock, *flag); + spin_lock(&schp->lock); spin_lock(&qhp->lock); cxio_flush_hw_cq(&schp->cq); cxio_count_scqes(&schp->cq, &qhp->wq, &count); flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count); spin_unlock(&qhp->lock); - spin_unlock_irqrestore(&schp->lock, *flag); + spin_unlock(&schp->lock); if (flushed) { - spin_lock_irqsave(&schp->comp_handler_lock, *flag); + spin_lock(&schp->comp_handler_lock); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); - spin_unlock_irqrestore(&schp->comp_handler_lock, *flag); + spin_unlock(&schp->comp_handler_lock); } /* deref */ if (atomic_dec_and_test(&qhp->refcnt)) wake_up(&qhp->wait); - spin_lock_irqsave(&qhp->lock, *flag); + spin_lock(&qhp->lock); } -static void flush_qp(struct iwch_qp *qhp, unsigned long *flag) +static void flush_qp(struct iwch_qp *qhp) { struct iwch_cq *rchp, *schp; @@ -859,19 +859,19 @@ static void flush_qp(struct iwch_qp *qhp, unsigned long *flag) if (qhp->ibqp.uobject) { cxio_set_wq_in_error(&qhp->wq); cxio_set_cq_in_error(&rchp->cq); - spin_lock_irqsave(&rchp->comp_handler_lock, *flag); + spin_lock(&rchp->comp_handler_lock); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); - spin_unlock_irqrestore(&rchp->comp_handler_lock, *flag); + spin_unlock(&rchp->comp_handler_lock); if (schp != rchp) { cxio_set_cq_in_error(&schp->cq); - spin_lock_irqsave(&schp->comp_handler_lock, *flag); + spin_lock(&schp->comp_handler_lock); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); - spin_unlock_irqrestore(&schp->comp_handler_lock, *flag); + spin_unlock(&schp->comp_handler_lock); } return; } - __flush_qp(qhp, rchp, schp, flag); + __flush_qp(qhp, rchp, schp); } @@ -1030,7 +1030,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, break; case IWCH_QP_STATE_ERROR: qhp->attr.state = IWCH_QP_STATE_ERROR; - flush_qp(qhp, &flag); + flush_qp(qhp); break; default: ret = -EINVAL; @@ -1078,7 +1078,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, } switch (attrs->next_state) { case IWCH_QP_STATE_IDLE: - flush_qp(qhp, &flag); + flush_qp(qhp); qhp->attr.state = IWCH_QP_STATE_IDLE; qhp->attr.llp_stream_handle = NULL; put_ep(&qhp->ep->com); @@ -1132,7 +1132,7 @@ err: free=1; wake_up(&qhp->wait); BUG_ON(!ep); - flush_qp(qhp, &flag); + flush_qp(qhp); out: spin_unlock_irqrestore(&qhp->lock, flag);