IB/hfi1: Add atomic triggered sleep/wakeup

When running iperf in a two host configuration the following trace can
occur:

[  319.728730] NETDEV WATCHDOG: ib0 (hfi1): transmit queue 0 timed out

The issue happens because the current implementation relies on the netif
txq being stopped to control the flushing of the tx list.

There are two resources that the transmit logic can wait on and stop the
txq:
- SDMA descriptors
- Ring space to hold completions

The ring space is tested on the sending side and relieved when the ring is
consumed in the napi tx reaping.

Unfortunately, that reaping can run conncurrently with the workqueue
flushing of the txlist.  If the txq is started just before the workitem
executes, the txlist will never be flushed, leading to the txq being
stuck.

Fix by:
- Adding sleep/wakeup wrappers
  * Use an atomic to control the call to the netif routines inside the
    wrappers

- Use another atomic to record ring space exhaustion
  * Only wakeup when the a ring space exhaustion has happened and it
    relieved

Add additional wrappers to clarify the ring space resource handling.

Fixes: d99dc602e2 ("IB/hfi1: Add functions to transmit datagram ipoib packets")
Link: https://lore.kernel.org/r/20200623204327.108092.4024.stgit@awfm-01.aw.intel.com
Reviewed-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
This commit is contained in:
Mike Marciniszyn 2020-06-23 16:43:28 -04:00 committed by Jason Gunthorpe
parent 82172b7655
commit 38fd98afee
2 changed files with 53 additions and 20 deletions

View File

@ -67,6 +67,9 @@ struct hfi1_ipoib_circ_buf {
* @sde: sdma engine
* @tx_list: tx request list
* @sent_txreqs: count of txreqs posted to sdma
* @stops: count of stops of queue
* @ring_full: ring has been filled
* @no_desc: descriptor shortage seen
* @flow: tracks when list needs to be flushed for a flow change
* @q_idx: ipoib Tx queue index
* @pkts_sent: indicator packets have been sent from this queue
@ -80,6 +83,9 @@ struct hfi1_ipoib_txq {
struct sdma_engine *sde;
struct list_head tx_list;
u64 sent_txreqs;
atomic_t stops;
atomic_t ring_full;
atomic_t no_desc;
union hfi1_ipoib_flow flow;
u8 q_idx;
bool pkts_sent;

View File

@ -55,23 +55,48 @@ static u64 hfi1_ipoib_txreqs(const u64 sent, const u64 completed)
return sent - completed;
}
static u64 hfi1_ipoib_used(struct hfi1_ipoib_txq *txq)
{
return hfi1_ipoib_txreqs(txq->sent_txreqs,
atomic64_read(&txq->complete_txreqs));
}
static void hfi1_ipoib_stop_txq(struct hfi1_ipoib_txq *txq)
{
if (atomic_inc_return(&txq->stops) == 1)
netif_stop_subqueue(txq->priv->netdev, txq->q_idx);
}
static void hfi1_ipoib_wake_txq(struct hfi1_ipoib_txq *txq)
{
if (atomic_dec_and_test(&txq->stops))
netif_wake_subqueue(txq->priv->netdev, txq->q_idx);
}
static uint hfi1_ipoib_ring_hwat(struct hfi1_ipoib_txq *txq)
{
return min_t(uint, txq->priv->netdev->tx_queue_len,
txq->tx_ring.max_items - 1);
}
static uint hfi1_ipoib_ring_lwat(struct hfi1_ipoib_txq *txq)
{
return min_t(uint, txq->priv->netdev->tx_queue_len,
txq->tx_ring.max_items) >> 1;
}
static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq)
{
if (unlikely(hfi1_ipoib_txreqs(++txq->sent_txreqs,
atomic64_read(&txq->complete_txreqs)) >=
min_t(unsigned int, txq->priv->netdev->tx_queue_len,
txq->tx_ring.max_items - 1)))
netif_stop_subqueue(txq->priv->netdev, txq->q_idx);
++txq->sent_txreqs;
if (hfi1_ipoib_used(txq) >= hfi1_ipoib_ring_hwat(txq) &&
!atomic_xchg(&txq->ring_full, 1))
hfi1_ipoib_stop_txq(txq);
}
static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq)
{
struct net_device *dev = txq->priv->netdev;
/* If the queue is already running just return */
if (likely(!__netif_subqueue_stopped(dev, txq->q_idx)))
return;
/* If shutting down just return as queue state is irrelevant */
if (unlikely(dev->reg_state != NETREG_REGISTERED))
return;
@ -86,11 +111,9 @@ static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq)
* Use the minimum of the current tx_queue_len or the rings max txreqs
* to protect against ring overflow.
*/
if (hfi1_ipoib_txreqs(txq->sent_txreqs,
atomic64_read(&txq->complete_txreqs))
< min_t(unsigned int, dev->tx_queue_len,
txq->tx_ring.max_items) >> 1)
netif_wake_subqueue(dev, txq->q_idx);
if (hfi1_ipoib_used(txq) < hfi1_ipoib_ring_lwat(txq) &&
atomic_xchg(&txq->ring_full, 0))
hfi1_ipoib_wake_txq(txq);
}
static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget)
@ -608,13 +631,14 @@ static int hfi1_ipoib_sdma_sleep(struct sdma_engine *sde,
return -EAGAIN;
}
netif_stop_subqueue(txq->priv->netdev, txq->q_idx);
if (list_empty(&txreq->list))
/* came from non-list submit */
list_add_tail(&txreq->list, &txq->tx_list);
if (list_empty(&txq->wait.list))
if (list_empty(&txq->wait.list)) {
if (!atomic_xchg(&txq->no_desc, 1))
hfi1_ipoib_stop_txq(txq);
iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
}
write_sequnlock(&sde->waitlock);
return -EBUSY;
@ -649,9 +673,9 @@ static void hfi1_ipoib_flush_txq(struct work_struct *work)
struct net_device *dev = txq->priv->netdev;
if (likely(dev->reg_state == NETREG_REGISTERED) &&
likely(__netif_subqueue_stopped(dev, txq->q_idx)) &&
likely(!hfi1_ipoib_flush_tx_list(dev, txq)))
netif_wake_subqueue(dev, txq->q_idx);
if (atomic_xchg(&txq->no_desc, 0))
hfi1_ipoib_wake_txq(txq);
}
int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
@ -705,6 +729,9 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
txq->sde = NULL;
INIT_LIST_HEAD(&txq->tx_list);
atomic64_set(&txq->complete_txreqs, 0);
atomic_set(&txq->stops, 0);
atomic_set(&txq->ring_full, 0);
atomic_set(&txq->no_desc, 0);
txq->q_idx = i;
txq->flow.tx_queue = 0xff;
txq->flow.sc5 = 0xff;
@ -770,7 +797,7 @@ static void hfi1_ipoib_drain_tx_list(struct hfi1_ipoib_txq *txq)
atomic64_inc(complete_txreqs);
}
if (hfi1_ipoib_txreqs(txq->sent_txreqs, atomic64_read(complete_txreqs)))
if (hfi1_ipoib_used(txq))
dd_dev_warn(txq->priv->dd,
"txq %d not empty found %llu requests\n",
txq->q_idx,