net: mana: Fix race of mana_hwc_post_rx_wqe and new hwc response

The mana_hwc_rx_event_handler() / mana_hwc_handle_resp() calls
complete(&ctx->comp_event) before posting the wqe back. It's
possible that other callers, like mana_create_txq(), start the
next round of mana_hwc_send_request() before the posting of wqe.
And if the HW is fast enough to respond, it can hit no_wqe error
on the HW channel, then the response message is lost. The mana
driver may fail to create queues and open, because of waiting for
the HW response and timed out.
Sample dmesg:
[  528.610840] mana 39d4:00:02.0: HWC: Request timed out!
[  528.614452] mana 39d4:00:02.0: Failed to send mana message: -110, 0x0
[  528.618326] mana 39d4:00:02.0 enP14804s2: Failed to create WQ object: -110

To fix it, move posting of rx wqe before complete(&ctx->comp_event).

Cc: stable@vger.kernel.org
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Long Li <longli@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Haiyang Zhang 2024-08-21 13:42:29 -07:00 committed by David S. Miller
parent 82b8000c28
commit 8af174ea86

View File

@ -52,32 +52,6 @@ static int mana_hwc_verify_resp_msg(const struct hwc_caller_ctx *caller_ctx,
return 0;
}
static void mana_hwc_handle_resp(struct hw_channel_context *hwc, u32 resp_len,
const struct gdma_resp_hdr *resp_msg)
{
struct hwc_caller_ctx *ctx;
int err;
if (!test_bit(resp_msg->response.hwc_msg_id,
hwc->inflight_msg_res.map)) {
dev_err(hwc->dev, "hwc_rx: invalid msg_id = %u\n",
resp_msg->response.hwc_msg_id);
return;
}
ctx = hwc->caller_ctx + resp_msg->response.hwc_msg_id;
err = mana_hwc_verify_resp_msg(ctx, resp_msg, resp_len);
if (err)
goto out;
ctx->status_code = resp_msg->status;
memcpy(ctx->output_buf, resp_msg, resp_len);
out:
ctx->error = err;
complete(&ctx->comp_event);
}
static int mana_hwc_post_rx_wqe(const struct hwc_wq *hwc_rxq,
struct hwc_work_request *req)
{
@ -101,6 +75,40 @@ static int mana_hwc_post_rx_wqe(const struct hwc_wq *hwc_rxq,
return err;
}
static void mana_hwc_handle_resp(struct hw_channel_context *hwc, u32 resp_len,
struct hwc_work_request *rx_req)
{
const struct gdma_resp_hdr *resp_msg = rx_req->buf_va;
struct hwc_caller_ctx *ctx;
int err;
if (!test_bit(resp_msg->response.hwc_msg_id,
hwc->inflight_msg_res.map)) {
dev_err(hwc->dev, "hwc_rx: invalid msg_id = %u\n",
resp_msg->response.hwc_msg_id);
mana_hwc_post_rx_wqe(hwc->rxq, rx_req);
return;
}
ctx = hwc->caller_ctx + resp_msg->response.hwc_msg_id;
err = mana_hwc_verify_resp_msg(ctx, resp_msg, resp_len);
if (err)
goto out;
ctx->status_code = resp_msg->status;
memcpy(ctx->output_buf, resp_msg, resp_len);
out:
ctx->error = err;
/* Must post rx wqe before complete(), otherwise the next rx may
* hit no_wqe error.
*/
mana_hwc_post_rx_wqe(hwc->rxq, rx_req);
complete(&ctx->comp_event);
}
static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
struct gdma_event *event)
{
@ -235,14 +243,12 @@ static void mana_hwc_rx_event_handler(void *ctx, u32 gdma_rxq_id,
return;
}
mana_hwc_handle_resp(hwc, rx_oob->tx_oob_data_size, resp);
mana_hwc_handle_resp(hwc, rx_oob->tx_oob_data_size, rx_req);
/* Do no longer use 'resp', because the buffer is posted to the HW
* in the below mana_hwc_post_rx_wqe().
/* Can no longer use 'resp', because the buffer is posted to the HW
* in mana_hwc_handle_resp() above.
*/
resp = NULL;
mana_hwc_post_rx_wqe(hwc_rxq, rx_req);
}
static void mana_hwc_tx_event_handler(void *ctx, u32 gdma_txq_id,