Merge branch 'xdp_rxq_info'

Jesper Dangaard Brouer says:

====================
V4:
* Added reviewers/acks to patches
* Fix patch desc in i40e that got out-of-sync with code
* Add SPDX license headers for the two new files added in patch 14

V3:
* Fixed bug in virtio_net driver
* Removed export of xdp_rxq_info_init()

V2:
* Changed API exposed to drivers
  - Removed invocation of "init" in drivers, and only call "reg"
    (Suggested by Saeed)
  - Allow "reg" to fail and handle this in drivers
    (Suggested by David Ahern)
* Removed the SINKQ qtype, instead allow to register as "unused"
* Also fixed some drivers during testing on actual HW (noted in patches)

There is a need for XDP to know more about the RX-queue a given XDP
frame has arrived on.  This applies to both the XDP bpf-prog and the
kernel side.

Instead of extending struct xdp_buff each time new info is needed,
this patchset takes a different approach.  Struct xdp_buff is only
extended with a pointer to a struct xdp_rxq_info (allowing for easier
extending this later).  This xdp_rxq_info contains information related
to how the driver has set up the individual RX-queues.  This is
read-mostly information, and all xdp_buff frames (in the driver's
napi_poll) point to the same xdp_rxq_info (per RX-queue).

We stress this data/cache-line is for read-mostly info.  This is NOT
for dynamic per packet info, use the data_meta for such use-cases.

This patchset starts out small, and only exposes ingress_ifindex and the
RX-queue index to the XDP/BPF program. Access to tangible info like
the ingress ifindex and RX queue index is fairly easy to comprehend.
The other future use-cases could allow XDP frames to be recycled back
to the originating device driver, by providing info on RX device and
queue number.

As XDP doesn't have driver feature flags, and eBPF code due to
bpf-tail-calls cannot determine which XDP driver invokes it, this
patchset has to update every driver that supports XDP.

For driver developers (review individual driver patches!):

The xdp_rxq_info is tied to the driver's RX-ring(s). Whenever an RX-ring
modification requires (temporarily) stopping RX frames, then the
xdp_rxq_info should (likely) also be unregistered and re-registered,
especially if reallocating the pages in the ring. Make sure ethtool
set_channels does the right thing. When replacing the XDP prog, if and
only if the RX-ring needs to be changed, then also re-register the
xdp_rxq_info.

I'm Cc'ing the individual driver patches to the registered maintainers.

Testing:

I've only tested the NIC drivers I have hardware for.  The general
test procedure is to (DUT = Device Under Test):
 (1) run pktgen script pktgen_sample04_many_flows.sh       (against DUT)
 (2) run samples/bpf program xdp_rxq_info --dev $DEV       (on DUT)
 (3) runtime modify number of NIC queues via ethtool -L    (on DUT)
 (4) runtime modify number of NIC ring-size via ethtool -G (on DUT)

Patch based on git tree bpf-next (at commit fb982666e380c1632a):
 https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Alexei Starovoitov 2018-01-05 15:30:47 -08:00
commit 11d16edb04
36 changed files with 990 additions and 28 deletions

View File

@ -2247,6 +2247,9 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
if (rxr->xdp_prog)
bpf_prog_put(rxr->xdp_prog);
if (xdp_rxq_info_is_reg(&rxr->xdp_rxq))
xdp_rxq_info_unreg(&rxr->xdp_rxq);
kfree(rxr->rx_tpa);
rxr->rx_tpa = NULL;
@ -2280,6 +2283,10 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
ring = &rxr->rx_ring_struct;
rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i);
if (rc < 0)
return rc;
rc = bnxt_alloc_ring(bp, ring);
if (rc)
return rc;
@ -2834,6 +2841,9 @@ void bnxt_set_ring_params(struct bnxt *bp)
bp->cp_ring_mask = bp->cp_bit - 1;
}
/* Changing allocation mode of RX rings.
* TODO: Update when extending xdp_rxq_info to support allocation modes.
*/
int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode)
{
if (page_mode) {

View File

@ -23,6 +23,7 @@
#include <net/devlink.h>
#include <net/dst_metadata.h>
#include <net/switchdev.h>
#include <net/xdp.h>
struct tx_bd {
__le32 tx_bd_len_flags_type;
@ -664,6 +665,7 @@ struct bnxt_rx_ring_info {
struct bnxt_ring_struct rx_ring_struct;
struct bnxt_ring_struct rx_agg_ring_struct;
struct xdp_rxq_info xdp_rxq;
};
struct bnxt_cp_ring_info {

View File

@ -96,6 +96,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
xdp.data = *data_ptr;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = *data_ptr + *len;
xdp.rxq = &rxr->xdp_rxq;
orig_data = xdp.data;
mapping = rx_buf->mapping - bp->rx_dma_offset;

View File

@ -521,7 +521,7 @@ static void nicvf_unmap_page(struct nicvf *nic, struct page *page, u64 dma_addr)
static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
struct cqe_rx_t *cqe_rx, struct snd_queue *sq,
struct sk_buff **skb)
struct rcv_queue *rq, struct sk_buff **skb)
{
struct xdp_buff xdp;
struct page *page;
@ -545,6 +545,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
xdp.data = (void *)cpu_addr;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &rq->xdp_rxq;
orig_data = xdp.data;
rcu_read_lock();
@ -698,7 +699,8 @@ static inline void nicvf_set_rxhash(struct net_device *netdev,
static void nicvf_rcv_pkt_handler(struct net_device *netdev,
struct napi_struct *napi,
struct cqe_rx_t *cqe_rx, struct snd_queue *sq)
struct cqe_rx_t *cqe_rx,
struct snd_queue *sq, struct rcv_queue *rq)
{
struct sk_buff *skb = NULL;
struct nicvf *nic = netdev_priv(netdev);
@ -724,7 +726,7 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
/* For XDP, ignore pkts spanning multiple pages */
if (nic->xdp_prog && (cqe_rx->rb_cnt == 1)) {
/* Packet consumed by XDP */
if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx, sq, &skb))
if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx, sq, rq, &skb))
return;
} else {
skb = nicvf_get_rcv_skb(snic, cqe_rx,
@ -781,6 +783,7 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
struct cqe_rx_t *cq_desc;
struct netdev_queue *txq;
struct snd_queue *sq = &qs->sq[cq_idx];
struct rcv_queue *rq = &qs->rq[cq_idx];
unsigned int tx_pkts = 0, tx_bytes = 0, txq_idx;
spin_lock_bh(&cq->lock);
@ -811,7 +814,7 @@ loop:
switch (cq_desc->cqe_type) {
case CQE_TYPE_RX:
nicvf_rcv_pkt_handler(netdev, napi, cq_desc, sq);
nicvf_rcv_pkt_handler(netdev, napi, cq_desc, sq, rq);
work_done++;
break;
case CQE_TYPE_SEND:

View File

@ -760,6 +760,7 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs,
if (!rq->enable) {
nicvf_reclaim_rcv_queue(nic, qs, qidx);
xdp_rxq_info_unreg(&rq->xdp_rxq);
return;
}
@ -772,6 +773,9 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs,
/* all writes of RBDR data to be loaded into L2 Cache as well*/
rq->caching = 1;
/* Driver has no proper error path for failed XDP RX-queue info reg */
WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx) < 0);
/* Send a mailbox msg to PF to config RQ */
mbx.rq.msg = NIC_MBOX_MSG_RQ_CFG;
mbx.rq.qs_num = qs->vnic_id;

View File

@ -12,6 +12,7 @@
#include <linux/netdevice.h>
#include <linux/iommu.h>
#include <linux/bpf.h>
#include <net/xdp.h>
#include "q_struct.h"
#define MAX_QUEUE_SET 128
@ -255,6 +256,7 @@ struct rcv_queue {
u8 start_qs_rbdr_idx; /* RBDR idx in the above QS */
u8 caching;
struct rx_tx_queue_stats stats;
struct xdp_rxq_info xdp_rxq;
} ____cacheline_aligned_in_smp;
struct cmp_queue {

View File

@ -1585,6 +1585,8 @@ static int i40e_set_ringparam(struct net_device *netdev,
*/
rx_rings[i].desc = NULL;
rx_rings[i].rx_bi = NULL;
/* Clear cloned XDP RX-queue info before setup call */
memset(&rx_rings[i].xdp_rxq, 0, sizeof(rx_rings[i].xdp_rxq));
/* this is to allow wr32 to have something to write to
* during early allocation of Rx buffers
*/

View File

@ -27,6 +27,7 @@
#include <linux/prefetch.h>
#include <net/busy_poll.h>
#include <linux/bpf_trace.h>
#include <net/xdp.h>
#include "i40e.h"
#include "i40e_trace.h"
#include "i40e_prototype.h"
@ -1236,6 +1237,8 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
void i40e_free_rx_resources(struct i40e_ring *rx_ring)
{
i40e_clean_rx_ring(rx_ring);
if (rx_ring->vsi->type == I40E_VSI_MAIN)
xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
rx_ring->xdp_prog = NULL;
kfree(rx_ring->rx_bi);
rx_ring->rx_bi = NULL;
@ -1256,6 +1259,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
{
struct device *dev = rx_ring->dev;
int err = -ENOMEM;
int bi_size;
/* warn if we are about to overwrite the pointer */
@ -1283,13 +1287,21 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
/* XDP RX-queue info only needed for RX rings exposed to XDP */
if (rx_ring->vsi->type == I40E_VSI_MAIN) {
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->queue_index);
if (err < 0)
goto err;
}
rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
return 0;
err:
kfree(rx_ring->rx_bi);
rx_ring->rx_bi = NULL;
return -ENOMEM;
return err;
}
/**
@ -2068,11 +2080,13 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
struct sk_buff *skb = rx_ring->skb;
u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
bool failure = false, xdp_xmit = false;
struct xdp_buff xdp;
xdp.rxq = &rx_ring->xdp_rxq;
while (likely(total_rx_packets < (unsigned int)budget)) {
struct i40e_rx_buffer *rx_buffer;
union i40e_rx_desc *rx_desc;
struct xdp_buff xdp;
unsigned int size;
u16 vlan_tag;
u8 rx_ptype;

View File

@ -27,6 +27,8 @@
#ifndef _I40E_TXRX_H_
#define _I40E_TXRX_H_
#include <net/xdp.h>
/* Interrupt Throttling and Rate Limiting Goodies */
#define I40E_MAX_ITR 0x0FF0 /* reg uses 2 usec resolution */
@ -428,6 +430,7 @@ struct i40e_ring {
*/
struct i40e_channel *ch;
struct xdp_rxq_info xdp_rxq;
} ____cacheline_internodealigned_in_smp;
static inline bool ring_uses_build_skb(struct i40e_ring *ring)

View File

@ -53,6 +53,7 @@
#include <linux/dca.h>
#endif
#include <net/xdp.h>
#include <net/busy_poll.h>
/* common prefix used by pr_<> macros */
@ -371,6 +372,7 @@ struct ixgbe_ring {
struct ixgbe_tx_queue_stats tx_stats;
struct ixgbe_rx_queue_stats rx_stats;
};
struct xdp_rxq_info xdp_rxq;
} ____cacheline_internodealigned_in_smp;
enum ixgbe_ring_f_enum {

View File

@ -1156,6 +1156,10 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
memcpy(&temp_ring[i], adapter->rx_ring[i],
sizeof(struct ixgbe_ring));
/* Clear copied XDP RX-queue info */
memset(&temp_ring[i].xdp_rxq, 0,
sizeof(temp_ring[i].xdp_rxq));
temp_ring[i].count = new_rx_count;
err = ixgbe_setup_rx_resources(adapter, &temp_ring[i]);
if (err) {

View File

@ -2318,12 +2318,14 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
#endif /* IXGBE_FCOE */
u16 cleaned_count = ixgbe_desc_unused(rx_ring);
bool xdp_xmit = false;
struct xdp_buff xdp;
xdp.rxq = &rx_ring->xdp_rxq;
while (likely(total_rx_packets < budget)) {
union ixgbe_adv_rx_desc *rx_desc;
struct ixgbe_rx_buffer *rx_buffer;
struct sk_buff *skb;
struct xdp_buff xdp;
unsigned int size;
/* return some buffers to hardware, one at a time is too slow */
@ -6444,6 +6446,11 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
/* XDP RX-queue info */
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
rx_ring->queue_index) < 0)
goto err;
rx_ring->xdp_prog = adapter->xdp_prog;
return 0;
@ -6541,6 +6548,7 @@ void ixgbe_free_rx_resources(struct ixgbe_ring *rx_ring)
ixgbe_clean_rx_ring(rx_ring);
rx_ring->xdp_prog = NULL;
xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
vfree(rx_ring->rx_buffer_info);
rx_ring->rx_buffer_info = NULL;

View File

@ -2172,8 +2172,9 @@ static int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i],
prof->rx_ring_size, priv->stride,
node))
node, i))
goto err;
}
#ifdef CONFIG_RFS_ACCEL

View File

@ -262,7 +262,7 @@ void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring **pring,
u32 size, u16 stride, int node)
u32 size, u16 stride, int node, int queue_index)
{
struct mlx4_en_dev *mdev = priv->mdev;
struct mlx4_en_rx_ring *ring;
@ -286,6 +286,9 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
ring->log_stride = ffs(ring->stride) - 1;
ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index) < 0)
goto err_ring;
tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
sizeof(struct mlx4_en_rx_alloc));
ring->rx_info = vzalloc_node(tmp, node);
@ -293,7 +296,7 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
ring->rx_info = vzalloc(tmp);
if (!ring->rx_info) {
err = -ENOMEM;
goto err_ring;
goto err_xdp_info;
}
}
@ -317,6 +320,8 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
err_info:
vfree(ring->rx_info);
ring->rx_info = NULL;
err_xdp_info:
xdp_rxq_info_unreg(&ring->xdp_rxq);
err_ring:
kfree(ring);
*pring = NULL;
@ -440,6 +445,7 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
lockdep_is_held(&mdev->state_lock));
if (old_prog)
bpf_prog_put(old_prog);
xdp_rxq_info_unreg(&ring->xdp_rxq);
mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
vfree(ring->rx_info);
ring->rx_info = NULL;
@ -652,6 +658,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
int cq_ring = cq->ring;
bool doorbell_pending;
struct mlx4_cqe *cqe;
struct xdp_buff xdp;
int polled = 0;
int index;
@ -666,6 +673,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
/* Protect accesses to: ring->xdp_prog, priv->mac_hash list */
rcu_read_lock();
xdp_prog = rcu_dereference(ring->xdp_prog);
xdp.rxq = &ring->xdp_rxq;
doorbell_pending = 0;
/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
@ -750,7 +758,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
* read bytes but not past the end of the frag.
*/
if (xdp_prog) {
struct xdp_buff xdp;
dma_addr_t dma;
void *orig_data;
u32 act;

View File

@ -46,6 +46,7 @@
#endif
#include <linux/cpu_rmap.h>
#include <linux/ptp_clock_kernel.h>
#include <net/xdp.h>
#include <linux/mlx4/device.h>
#include <linux/mlx4/qp.h>
@ -356,6 +357,7 @@ struct mlx4_en_rx_ring {
unsigned long dropped;
int hwtstamp_rx_filter;
cpumask_var_t affinity_mask;
struct xdp_rxq_info xdp_rxq;
};
struct mlx4_en_cq {
@ -720,7 +722,7 @@ void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev);
void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv);
int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring **pring,
u32 size, u16 stride, int node);
u32 size, u16 stride, int node, int queue_index);
void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring **pring,
u32 size, u16 stride);

View File

@ -46,6 +46,7 @@
#include <linux/mlx5/transobj.h>
#include <linux/rhashtable.h>
#include <net/switchdev.h>
#include <net/xdp.h>
#include "wq.h"
#include "mlx5_core.h"
#include "en_stats.h"
@ -571,6 +572,9 @@ struct mlx5e_rq {
u32 rqn;
struct mlx5_core_dev *mdev;
struct mlx5_core_mkey umr_mkey;
/* XDP read-mostly */
struct xdp_rxq_info xdp_rxq;
} ____cacheline_aligned_in_smp;
struct mlx5e_channel {

View File

@ -582,6 +582,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
goto err_rq_wq_destroy;
}
if (xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix) < 0)
goto err_rq_wq_destroy;
rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
rq->buff.headroom = params->rq_headroom;
@ -687,6 +690,7 @@ err_destroy_umr_mkey:
err_rq_wq_destroy:
if (rq->xdp_prog)
bpf_prog_put(rq->xdp_prog);
xdp_rxq_info_unreg(&rq->xdp_rxq);
mlx5_wq_destroy(&rq->wq_ctrl);
return err;
@ -699,6 +703,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
if (rq->xdp_prog)
bpf_prog_put(rq->xdp_prog);
xdp_rxq_info_unreg(&rq->xdp_rxq);
switch (rq->wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
mlx5e_rq_free_mpwqe_info(rq);
@ -2766,6 +2772,9 @@ static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
if (err)
return err;
/* Mark as unused given "Drop-RQ" packets never reach XDP */
xdp_rxq_info_unused(&rq->xdp_rxq);
rq->mdev = mdev;
return 0;

View File

@ -812,6 +812,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + *len;
xdp.data_hard_start = va;
xdp.rxq = &rq->xdp_rxq;
act = bpf_prog_run_xdp(prog, &xdp);
switch (act) {

View File

@ -47,6 +47,7 @@
#include <linux/netdevice.h>
#include <linux/pci.h>
#include <linux/io-64-nonatomic-hi-lo.h>
#include <net/xdp.h>
#include "nfp_net_ctrl.h"
@ -350,6 +351,7 @@ struct nfp_net_rx_buf {
* @rxds: Virtual address of FL/RX ring in host memory
* @dma: DMA address of the FL/RX ring
* @size: Size, in bytes, of the FL/RX ring (needed to free)
* @xdp_rxq: RX-ring info avail for XDP
*/
struct nfp_net_rx_ring {
struct nfp_net_r_vector *r_vec;
@ -361,13 +363,14 @@ struct nfp_net_rx_ring {
u32 idx;
int fl_qcidx;
unsigned int size;
u8 __iomem *qcp_fl;
struct nfp_net_rx_buf *rxbufs;
struct nfp_net_rx_desc *rxds;
dma_addr_t dma;
unsigned int size;
struct xdp_rxq_info xdp_rxq;
} ____cacheline_aligned;
/**

View File

@ -1608,11 +1608,13 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
unsigned int true_bufsz;
struct sk_buff *skb;
int pkts_polled = 0;
struct xdp_buff xdp;
int idx;
rcu_read_lock();
xdp_prog = READ_ONCE(dp->xdp_prog);
true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz;
xdp.rxq = &rx_ring->xdp_rxq;
tx_ring = r_vec->xdp_ring;
while (pkts_polled < budget) {
@ -1703,7 +1705,6 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
dp->bpf_offload_xdp) && !meta.portid) {
void *orig_data = rxbuf->frag + pkt_off;
unsigned int dma_off;
struct xdp_buff xdp;
int act;
xdp.data_hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
@ -2252,6 +2253,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
kfree(rx_ring->rxbufs);
if (rx_ring->rxds)
@ -2275,7 +2277,11 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
static int
nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
{
int sz;
int sz, err;
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, dp->netdev, rx_ring->idx);
if (err < 0)
return err;
rx_ring->cnt = dp->rxd_cnt;
rx_ring->size = sizeof(*rx_ring->rxds) * rx_ring->cnt;

View File

@ -40,6 +40,7 @@
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/qed/qede_rdma.h>
#include <linux/io.h>
#ifdef CONFIG_RFS_ACCEL
@ -345,6 +346,7 @@ struct qede_rx_queue {
u64 xdp_no_pass;
void *handle;
struct xdp_rxq_info xdp_rxq;
};
union db_prod {

View File

@ -1006,6 +1006,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
xdp.data = xdp.data_hard_start + *data_offset;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + *len;
xdp.rxq = &rxq->xdp_rxq;
/* Queues always have a full reset currently, so for the time
* being until there's atomic program replace just mark read

View File

@ -765,6 +765,12 @@ static void qede_free_fp_array(struct qede_dev *edev)
fp = &edev->fp_array[i];
kfree(fp->sb_info);
/* Handle mem alloc failure case where qede_init_fp
* didn't register xdp_rxq_info yet.
* Implicit only (fp->type & QEDE_FASTPATH_RX)
*/
if (fp->rxq && xdp_rxq_info_is_reg(&fp->rxq->xdp_rxq))
xdp_rxq_info_unreg(&fp->rxq->xdp_rxq);
kfree(fp->rxq);
kfree(fp->xdp_tx);
kfree(fp->txq);
@ -1493,6 +1499,10 @@ static void qede_init_fp(struct qede_dev *edev)
else
fp->rxq->data_direction = DMA_FROM_DEVICE;
fp->rxq->dev = &edev->pdev->dev;
/* Driver has no error path from here */
WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev,
fp->rxq->rxq_id) < 0);
}
if (fp->type & QEDE_FASTPATH_TX) {

View File

@ -180,6 +180,7 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
struct xdp_rxq_info xdp_rxq;
};
struct tun_flow_entry {
@ -687,8 +688,10 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun->dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(tun->dev);
}
if (tun)
if (tun) {
skb_array_cleanup(&tfile->tx_array);
xdp_rxq_info_unreg(&tfile->xdp_rxq);
}
sock_put(&tfile->sk);
}
}
@ -728,11 +731,13 @@ static void tun_detach_all(struct net_device *dev)
tun_napi_del(tun, tfile);
/* Drop read queue */
tun_queue_purge(tfile);
xdp_rxq_info_unreg(&tfile->xdp_rxq);
sock_put(&tfile->sk);
}
list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
tun_enable_queue(tfile);
tun_queue_purge(tfile);
xdp_rxq_info_unreg(&tfile->xdp_rxq);
sock_put(&tfile->sk);
}
BUG_ON(tun->numdisabled != 0);
@ -784,6 +789,22 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
tfile->queue_index = tun->numqueues;
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
if (tfile->detached) {
/* Re-attach detached tfile, updating XDP queue_index */
WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));
if (tfile->xdp_rxq.queue_index != tfile->queue_index)
tfile->xdp_rxq.queue_index = tfile->queue_index;
} else {
/* Setup XDP RX-queue info, for new tfile getting attached */
err = xdp_rxq_info_reg(&tfile->xdp_rxq,
tun->dev, tfile->queue_index);
if (err < 0)
goto out;
err = 0;
}
rcu_assign_pointer(tfile->tun, tun);
rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
tun->numqueues++;
@ -1508,6 +1529,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
xdp.data = buf + pad;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &tfile->xdp_rxq;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);

View File

@ -31,6 +31,7 @@
#include <linux/average.h>
#include <linux/filter.h>
#include <net/route.h>
#include <net/xdp.h>
static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);
@ -115,6 +116,8 @@ struct receive_queue {
/* Name of this receive queue: input.$index */
char name[40];
struct xdp_rxq_info xdp_rxq;
};
struct virtnet_info {
@ -559,6 +562,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdp.data = xdp.data_hard_start + xdp_headroom;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &rq->xdp_rxq;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
@ -692,6 +696,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
xdp.data = data + vi->hdr_len;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + (len - vi->hdr_len);
xdp.rxq = &rq->xdp_rxq;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
if (act != XDP_PASS)
@ -1225,13 +1231,18 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
static int virtnet_open(struct net_device *dev)
{
struct virtnet_info *vi = netdev_priv(dev);
int i;
int i, err;
for (i = 0; i < vi->max_queue_pairs; i++) {
if (i < vi->curr_queue_pairs)
/* Make sure we have some buffers: if oom use wq. */
if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i);
if (err < 0)
return err;
virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
}
@ -1560,6 +1571,7 @@ static int virtnet_close(struct net_device *dev)
cancel_delayed_work_sync(&vi->refill);
for (i = 0; i < vi->max_queue_pairs; i++) {
xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
napi_disable(&vi->rq[i].napi);
virtnet_napi_tx_disable(&vi->sq[i].napi);
}

View File

@ -20,6 +20,7 @@
#include <linux/set_memory.h>
#include <linux/kallsyms.h>
#include <net/xdp.h>
#include <net/sch_generic.h>
#include <uapi/linux/filter.h>
@ -503,6 +504,7 @@ struct xdp_buff {
void *data_end;
void *data_meta;
void *data_hard_start;
struct xdp_rxq_info *rxq;
};
/* Compute the linear packet data range [data, data_end) which

View File

@ -44,6 +44,7 @@
#include <net/dcbnl.h>
#endif
#include <net/netprio_cgroup.h>
#include <net/xdp.h>
#include <linux/netdev_features.h>
#include <linux/neighbour.h>
@ -686,6 +687,7 @@ struct netdev_rx_queue {
#endif
struct kobject kobj;
struct net_device *dev;
struct xdp_rxq_info xdp_rxq;
} ____cacheline_aligned_in_smp;
/*

48
include/net/xdp.h Normal file
View File

@ -0,0 +1,48 @@
/* include/net/xdp.h
*
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
* Released under terms in GPL version 2. See COPYING.
*/
#ifndef __LINUX_NET_XDP_H__
#define __LINUX_NET_XDP_H__
/**
* DOC: XDP RX-queue information
*
* The XDP RX-queue info (xdp_rxq_info) is associated with the driver
* level RX-ring queues. It is information that is specific to how
* the driver has configured a given RX-ring queue.
*
* Each xdp_buff frame received in the driver carries a (pointer)
* reference to this xdp_rxq_info structure. This provides the XDP
* data-path read-access to RX-info for both kernel and bpf-side
* (limited subset).
*
* For now, direct access is only safe while running in NAPI/softirq
* context. The contents are read-mostly and must not be updated during
* driver NAPI/softirq poll.
*
* The driver usage API is a register and unregister API.
*
* The struct is not directly tied to the XDP prog. A new XDP prog
* can be attached as long as it doesn't change the underlying
* RX-ring. If the RX-ring does change significantly, the NIC driver
* naturally needs to stop the RX-ring before purging and reallocating
* memory. In that process the driver MUST call unregister (which
* also applies for driver shutdown and unload). The register API is
* also mandatory during RX-ring setup.
*/
/* Per RX-queue information that every xdp_buff received on the queue
 * points at.  All fields are filled in by xdp_rxq_info_reg().
 */
struct xdp_rxq_info {
/* net_device handed to xdp_rxq_info_reg(); cleared to NULL on unreg */
struct net_device *dev;
/* queue index handed to xdp_rxq_info_reg() */
u32 queue_index;
/* one of the REG_STATE_* values private to net/core/xdp.c */
u32 reg_state;
} ____cacheline_aligned; /* perf critical, avoid false-sharing */
/* Register @xdp_rxq for @dev / @queue_index.  Returns 0 on success,
 * -EINVAL if the info was marked unused, -ENODEV if @dev is NULL.
 */
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
struct net_device *dev, u32 queue_index);
/* Unregister @xdp_rxq; a no-op for infos marked unused, warns when the
 * info is not currently registered.
 */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
/* Mark @xdp_rxq as unused, i.e. it will never be registered */
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
/* True only while @xdp_rxq is in the registered state */
bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
#endif /* __LINUX_NET_XDP_H__ */

View File

@ -899,6 +899,9 @@ struct xdp_md {
__u32 data;
__u32 data_end;
__u32 data_meta;
/* Below access goes through struct xdp_rxq_info */
__u32 ingress_ifindex; /* rxq->dev->ifindex */
__u32 rx_queue_index; /* rxq->queue_index */
};
enum sk_action {

View File

@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
fib_notifier.o
fib_notifier.o xdp.o
obj-y += net-sysfs.o
obj-$(CONFIG_PROC_FS) += net-procfs.o

View File

@ -3906,9 +3906,33 @@ drop:
return NET_RX_DROP;
}
/* Look up the netdev_rx_queue that @skb arrived on.
 *
 * Falls back to the device's first RX queue when the skb carries no
 * recorded queue, or when the recorded index is outside the range of
 * real RX queues (the latter also triggers a one-time warning on
 * multi-queue devices).
 */
static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_rx_queue *first = dev->_rx;
	u16 index;

	if (!skb_rx_queue_recorded(skb))
		return first;

	index = skb_get_rx_queue(skb);
	if (likely(index < dev->real_num_rx_queues))
		return first + index;

	WARN_ONCE(dev->real_num_rx_queues > 1,
		  "%s received packet on queue %u, but number "
		  "of RX queues is %u\n",
		  dev->name, index, dev->real_num_rx_queues);

	return first; /* Return first rxqueue */
}
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct bpf_prog *xdp_prog)
{
struct netdev_rx_queue *rxqueue;
u32 metalen, act = XDP_DROP;
struct xdp_buff xdp;
void *orig_data;
@ -3952,6 +3976,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
xdp.data_hard_start = skb->data - skb_headroom(skb);
orig_data = xdp.data;
rxqueue = netif_get_rxqueue(skb);
xdp.rxq = &rxqueue->xdp_rxq;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
off = xdp.data - orig_data;
@ -7589,12 +7616,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
#ifdef CONFIG_SYSFS
static int netif_alloc_rx_queues(struct net_device *dev)
{
unsigned int i, count = dev->num_rx_queues;
struct netdev_rx_queue *rx;
size_t sz = count * sizeof(*rx);
int err = 0;
BUG_ON(count < 1);
@ -7604,11 +7631,39 @@ static int netif_alloc_rx_queues(struct net_device *dev)
dev->_rx = rx;
for (i = 0; i < count; i++)
for (i = 0; i < count; i++) {
rx[i].dev = dev;
/* XDP RX-queue setup */
err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
if (err < 0)
goto err_rxq_info;
}
return 0;
err_rxq_info:
/* Rollback successful reg's and free other resources */
while (i--)
xdp_rxq_info_unreg(&rx[i].xdp_rxq);
kfree(dev->_rx);
dev->_rx = NULL;
return err;
}
/* Release everything netif_alloc_rx_queues() set up for @dev: unregister
 * the XDP RX-queue info of every RX queue and free the queue array.
 *
 * free_netdev() calls this helper in place of its former direct
 * kvfree(dev->_rx), so the array must be freed here or it is leaked.
 */
static void netif_free_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;
	struct netdev_rx_queue *rx = dev->_rx;

	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
	if (!rx)
		return;

	for (i = 0; i < count; i++)
		xdp_rxq_info_unreg(&rx[i].xdp_rxq);

	/* Same free routine free_netdev() used for dev->_rx before */
	kvfree(rx);
}
#endif
static void netdev_init_one_queue(struct net_device *dev,
struct netdev_queue *queue, void *_unused)
@ -8169,12 +8224,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
#ifdef CONFIG_SYSFS
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
#endif
alloc_size = sizeof(struct net_device);
if (sizeof_priv) {
@ -8231,12 +8284,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (netif_alloc_netdev_queues(dev))
goto free_all;
#ifdef CONFIG_SYSFS
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
goto free_all;
#endif
strcpy(dev->name, name);
dev->name_assign_type = name_assign_type;
@ -8275,9 +8326,7 @@ void free_netdev(struct net_device *dev)
might_sleep();
netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
kvfree(dev->_rx);
#endif
netif_free_rx_queues(dev);
kfree(rcu_dereference_protected(dev->ingress_queue, 1));

View File

@ -4304,6 +4304,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
si->dst_reg, si->src_reg,
offsetof(struct xdp_buff, data_end));
break;
case offsetof(struct xdp_md, ingress_ifindex):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
si->dst_reg, si->src_reg,
offsetof(struct xdp_buff, rxq));
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
si->dst_reg, si->dst_reg,
offsetof(struct xdp_rxq_info, dev));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
bpf_target_off(struct net_device,
ifindex, 4, target_size));
break;
case offsetof(struct xdp_md, rx_queue_index):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
si->dst_reg, si->src_reg,
offsetof(struct xdp_buff, rxq));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
bpf_target_off(struct xdp_rxq_info,
queue_index, 4, target_size));
break;
}
return insn - insn_buf;

73
net/core/xdp.c Normal file
View File

@ -0,0 +1,73 @@
/* net/core/xdp.c
*
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
* Released under terms in GPL version 2. See COPYING.
*/
#include <linux/types.h>
#include <linux/mm.h>
#include <net/xdp.h>
/* Values stored in xdp_rxq_info.reg_state */
/* Zero-filled struct that has not been registered */
#define REG_STATE_NEW 0x0
/* Set by a successful xdp_rxq_info_reg() */
#define REG_STATE_REGISTERED 0x1
/* Set by xdp_rxq_info_unreg() */
#define REG_STATE_UNREGISTERED 0x2
/* Set by xdp_rxq_info_unused(); such an info must not be registered */
#define REG_STATE_UNUSED 0x3
/* Unregister an RX-queue info: move it to the unregistered state and
 * drop its net_device pointer.  Infos marked unused are left untouched.
 * Warns if the info was not in the registered state.
 */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
	/* Simplify driver cleanup code paths, allow unreg "unused" */
	if (xdp_rxq->reg_state != REG_STATE_UNUSED) {
		WARN(xdp_rxq->reg_state != REG_STATE_REGISTERED, "Driver BUG");

		xdp_rxq->dev = NULL;
		xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
	}
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
/* Zero the whole structure; this also resets reg_state to
 * REG_STATE_NEW (0x0).
 */
static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
{
memset(xdp_rxq, 0, sizeof(*xdp_rxq));
}
/* Register an RX-queue info for @dev and @queue_index.
 *
 * An info marked unused is rejected with -EINVAL.  An info that is still
 * registered is unregistered first (with a warning, as the driver missed
 * an unreg).  A NULL @dev is rejected with -ENODEV.
 *
 * Returns 0 on success, negative on failure
 */
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
		     struct net_device *dev, u32 queue_index)
{
	switch (xdp_rxq->reg_state) {
	case REG_STATE_UNUSED:
		WARN(1, "Driver promised not to register this");
		return -EINVAL;
	case REG_STATE_REGISTERED:
		WARN(1, "Missing unregister, handled but fix driver");
		xdp_rxq_info_unreg(xdp_rxq);
		break;
	default:
		/* NEW or UNREGISTERED: nothing to undo */
		break;
	}

	if (!dev) {
		WARN(1, "Missing net_device from driver");
		return -ENODEV;
	}

	/* State either UNREGISTERED or NEW */
	xdp_rxq_info_init(xdp_rxq);
	xdp_rxq->queue_index = queue_index;
	xdp_rxq->dev = dev;
	xdp_rxq->reg_state = REG_STATE_REGISTERED;

	return 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
/**
 * xdp_rxq_info_unused - mark an RX-queue info struct as deliberately unused
 * @xdp_rxq: RX-queue info to mark
 *
 * After this call xdp_rxq_info_reg() rejects the struct with -EINVAL and
 * xdp_rxq_info_unreg() silently ignores it.
 */
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
{
	xdp_rxq->reg_state = REG_STATE_UNUSED;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
/**
 * xdp_rxq_info_is_reg - test whether an RX-queue info struct is registered
 * @xdp_rxq: RX-queue info to inspect
 *
 * Returns true only when reg_state equals REG_STATE_REGISTERED.
 */
bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
{
	if (xdp_rxq->reg_state != REG_STATE_REGISTERED)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);

View File

@ -41,6 +41,7 @@ hostprogs-y += xdp_redirect
hostprogs-y += xdp_redirect_map
hostprogs-y += xdp_redirect_cpu
hostprogs-y += xdp_monitor
hostprogs-y += xdp_rxq_info
hostprogs-y += syscall_tp
# Libbpf dependencies
@ -90,6 +91,7 @@ xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
# Tell kbuild to always build the programs
@ -139,6 +141,7 @@ always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
always += xdp_redirect_cpu_kern.o
always += xdp_monitor_kern.o
always += xdp_rxq_info_kern.o
always += syscall_tp_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@ -182,6 +185,7 @@ HOSTLOADLIBES_xdp_redirect += -lelf
HOSTLOADLIBES_xdp_redirect_map += -lelf
HOSTLOADLIBES_xdp_redirect_cpu += -lelf
HOSTLOADLIBES_xdp_monitor += -lelf
HOSTLOADLIBES_xdp_rxq_info += -lelf
HOSTLOADLIBES_syscall_tp += -lelf
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:

View File

@ -0,0 +1,96 @@
/* SPDX-License-Identifier: GPL-2.0
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*
* Example of how to extract XDP RX-queue info
*/
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
/* Config set up from userspace
 *
 * The user side stores the ifindex in config_map, to verify that
 * ctx->ingress_ifindex is correct (against the configured ifindex).
 * The action field is the value xdp_prognum0 returns once all its
 * checks have passed.
 */
struct config {
__u32 action;
int ifindex;
};
/* Array map with a single entry (key 0) holding one struct config */
struct bpf_map_def SEC("maps") config_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(struct config),
.max_entries = 1,
};
/* Common stats data record (shared with userspace)
 *
 * processed: incremented by xdp_prognum0 for each counted packet
 * issue:     incremented by xdp_prognum0 for each detected problem
 *            (ifindex mismatch, or rx_queue_index beyond the map size)
 */
struct datarec {
__u64 processed;
__u64 issue;
};
/* Per-CPU array map with a single entry (key 0): global packet stats */
struct bpf_map_def SEC("maps") stats_global_map = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(struct datarec),
.max_entries = 1,
};
/* Number of RX queues that get their own stats slot */
#define MAX_RXQs 64
/* Stats per rx_queue_index (per CPU)
 *
 * The map has MAX_RXQs + 1 entries: xdp_prognum0 accounts every packet
 * whose rx_queue_index is >= MAX_RXQs in the extra last slot (key
 * MAX_RXQs).
 */
struct bpf_map_def SEC("maps") rx_queue_index_map = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(struct datarec),
.max_entries = MAX_RXQs + 1,
};
/* XDP program: count packets globally and per RX queue.
 *
 * Returns XDP_ABORTED when a map lookup fails or when the ingress
 * ifindex seen in the context differs from the one configured in
 * config_map; otherwise returns the action configured in config_map.
 */
SEC("xdp_prog0")
int xdp_prognum0(struct xdp_md *ctx)
{
	struct datarec *rec, *rxq_rec;
	int ingress_ifindex;
	struct config *config;
	u32 key = 0;

	/* Global stats record */
	rec = bpf_map_lookup_elem(&stats_global_map, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	/* Accessing ctx->ingress_ifindex causes BPF to rewrite the BPF
	 * instructions inside the kernel to access xdp_rxq->dev->ifindex
	 */
	ingress_ifindex = ctx->ingress_ifindex;

	config = bpf_map_lookup_elem(&config_map, &key);
	if (!config)
		return XDP_ABORTED;

	/* Simple test: check ctx provided ifindex is as expected */
	if (ingress_ifindex != config->ifindex) {
		/* count this error case */
		rec->issue++;
		return XDP_ABORTED;
	}

	/* Update stats per rx_queue_index.  Queue indexes too large for
	 * the stats map are all accounted in the last slot (MAX_RXQs).
	 */
	key = ctx->rx_queue_index;
	if (key >= MAX_RXQs)
		key = MAX_RXQs;
	rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key);
	if (!rxq_rec)
		return XDP_ABORTED;
	rxq_rec->processed++;
	/* Landing in the overflow slot is itself reported as an issue */
	if (key == MAX_RXQs)
		rxq_rec->issue++;

	return config->action;
}
/* License string placed in the "license" section of the object file */
char _license[] SEC("license") = "GPL";

View File

@ -0,0 +1,531 @@
/* SPDX-License-Identifier: GPL-2.0
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*/
static const char *__doc__ = " XDP RX-queue info extract example\n\n"
"Monitor how many packets per sec (pps) are received\n"
"per NIC RX queue index and which CPU processed the packet\n"
;
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <locale.h>
#include <sys/resource.h>
#include <getopt.h>
#include <net/if.h>
#include <time.h>
#include <arpa/inet.h>
#include <linux/if_link.h>
#include "libbpf.h"
#include "bpf_load.h"
#include "bpf_util.h"
/* Interface index resolved from --dev; stays -1 until the option is parsed */
static int ifindex = -1;
/* Storage for the interface name given via --dev */
static char ifname_buf[IF_NAMESIZE];
/* Points at ifname_buf once --dev has been parsed, NULL before that */
static char *ifname;
/* Flags handed to set_link_xdp_fd(); --skb-mode ORs in XDP_FLAGS_SKB_MODE */
static __u32 xdp_flags;
/* Exit return codes
 *
 * As used in this file: EXIT_FAIL_OPTION for command line errors,
 * EXIT_FAIL_XDP when attaching the XDP program fails, EXIT_FAIL_BPF when
 * storing the config in the map fails, EXIT_FAIL_MEM for allocation
 * failures and EXIT_FAIL for other errors.
 */
#define EXIT_OK 0
#define EXIT_FAIL 1
#define EXIT_FAIL_OPTION 2
#define EXIT_FAIL_XDP 3
#define EXIT_FAIL_BPF 4
#define EXIT_FAIL_MEM 5
/* Long options accepted by main(); the table is terminated by an
 * all-zero entry and is also walked by usage() to list the options.
 */
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
{"dev", required_argument, NULL, 'd' },
{"skb-mode", no_argument, NULL, 'S' },
{"sec", required_argument, NULL, 's' },
{"no-separators", no_argument, NULL, 'z' },
{"action", required_argument, NULL, 'a' },
{0, 0, NULL, 0 }
};
/* Signal handler (installed for SIGINT by main()): report the
 * interruption, call set_link_xdp_fd() with fd -1 for the interface when
 * one was resolved (presumably detaching the program, as the message
 * says), then exit with EXIT_OK.  The signal number itself is not used.
 */
static void int_exit(int sig)
{
	const bool have_ifindex = ifindex > -1;

	fprintf(stderr,
		"Interrupted: Removing XDP program on ifindex:%d device:%s\n",
		ifindex, ifname);

	if (have_ifindex)
		set_link_xdp_fd(ifindex, -1, xdp_flags);

	exit(EXIT_OK);
}
/* Value written by main() into the config map (map_fd[0]).
 * NOTE(review): presumably has to stay layout-compatible with
 * struct config in the _kern.c program - confirm when changing either.
 */
struct config {
__u32 action;
int ifindex;
};
/* Number of entries in xdp_action_names: actions up to and including XDP_TX */
#define XDP_ACTION_MAX (XDP_TX + 1)
/* Length of the longest name in the table ("XDP_ABORTED"), without the NUL */
#define XDP_ACTION_MAX_STRLEN 11
/* Action value -> printable name, indexed by the XDP_* action constant */
static const char *xdp_action_names[XDP_ACTION_MAX] = {
[XDP_ABORTED] = "XDP_ABORTED",
[XDP_DROP] = "XDP_DROP",
[XDP_PASS] = "XDP_PASS",
[XDP_TX] = "XDP_TX",
};
/* Translate an XDP action value into its name.
 *
 * Returns the entry of xdp_action_names for @action, or NULL when
 * @action lies outside the table (negative or >= XDP_ACTION_MAX).
 */
static const char *action2str(int action)
{
	/* The lower bound matters: @action is signed, and a negative value
	 * would otherwise index in front of the table.
	 */
	if (action >= 0 && action < XDP_ACTION_MAX)
		return xdp_action_names[action];
	return NULL;
}
/* Map an action name (e.g. "XDP_DROP") to its XDP action value.
 *
 * At most XDP_ACTION_MAX_STRLEN characters are compared against each
 * entry of xdp_action_names.
 *
 * Returns the matching index into xdp_action_names, or -1 when
 * @action_str matches none of the known names.
 */
static int parse_xdp_action(char *action_str)
{
	int i;

	for (i = 0; i < XDP_ACTION_MAX; i++) {
		if (strncmp(xdp_action_names[i], action_str,
			    XDP_ACTION_MAX_STRLEN) == 0)
			return i;
	}

	/* Plain int sentinel: no unsigned 64-bit value is narrowed to int */
	return -1;
}
/* Print every name in xdp_action_names, one per line, as the list of
 * values accepted by --action.
 */
static void list_xdp_actions(void)
{
	const char *const *name = xdp_action_names;
	const char *const *end = xdp_action_names + XDP_ACTION_MAX;

	printf("Available XDP --action <options>\n");
	while (name < end) {
		printf("\t%s\n", *name);
		name++;
	}
	printf("\n");
}
/* Print the documentation text, every entry of long_options and the
 * list of accepted XDP actions.
 *
 * @argv: program argument vector; only argv[0] is used (program name).
 */
static void usage(char *argv[])
{
	const struct option *opt;

	printf("\nDOCUMENTATION:\n%s\n", __doc__);
	printf(" Usage: %s (options-see-below)\n", argv[0]);
	printf(" Listing options:\n");

	/* The option table ends with an entry whose name is NULL */
	for (opt = long_options; opt->name; opt++) {
		printf(" --%-12s", opt->name);
		if (opt->flag)
			printf(" flag (internal value:%d)",
			       *opt->flag);
		else
			printf(" short-option: -%c",
			       opt->val);
		printf("\n");
	}
	printf("\n");

	list_xdp_actions();
}
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */

/* Read CLOCK_MONOTONIC and return it as nanoseconds.
 *
 * Exits the program with EXIT_FAIL when clock_gettime() fails.
 */
static __u64 gettime(void)
{
	struct timespec t;
	int res;

	res = clock_gettime(CLOCK_MONOTONIC, &t);
	if (res < 0) {
		/* Name the call that actually failed */
		fprintf(stderr, "Error with clock_gettime! (%i)\n", res);
		exit(EXIT_FAIL);
	}
	/* Widen tv_sec before multiplying so the product is 64-bit */
	return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
}
/* Common stats data record shared with _kern.c
 *
 * One such record is read per CPU from the per-CPU maps; see
 * map_collect_percpu().
 */
struct datarec {
__u64 processed;
__u64 issue;
};
/* One snapshot of a single map entry, filled by map_collect_percpu():
 * timestamp is the gettime() value taken right after the map read,
 * total holds the sums over all CPUs, and cpu points to an array with
 * one datarec per possible CPU (see alloc_record_per_cpu()).
 */
struct record {
__u64 timestamp;
struct datarec total;
struct datarec *cpu;
};
/* Full snapshot of all statistics: stats is filled from the global stats
 * map (map_data[1]), rxq is an array with one record per entry of the
 * rx_queue_index map (map_data[2]); see stats_collect().
 */
struct stats_record {
struct record stats;
struct record *rxq;
};
/* Allocate a zero-initialised array with one struct datarec per
 * possible CPU (as reported by bpf_num_possible_cpus()).
 *
 * Never returns NULL: on allocation failure an error is printed and the
 * program exits with EXIT_FAIL_MEM.  The caller owns the array and
 * releases it with free().
 */
static struct datarec *alloc_record_per_cpu(void)
{
	unsigned int nr_cpus = bpf_num_possible_cpus();
	struct datarec *array;

	/* calloc() zeroes the memory, so nothing touches the buffer before
	 * the NULL check below.
	 */
	array = calloc(nr_cpus, sizeof(*array));
	if (!array) {
		fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
		exit(EXIT_FAIL_MEM);
	}
	return array;
}
/* Allocate a zero-initialised array with one struct record per entry of
 * the rx_queue_index map (map_data[2]).
 *
 * Never returns NULL: on allocation failure an error is printed and the
 * program exits with EXIT_FAIL_MEM.  The per-CPU arrays inside each
 * record are not allocated here.
 */
static struct record *alloc_record_per_rxq(void)
{
	unsigned int nr_rxqs = map_data[2].def.max_entries;
	struct record *array;

	/* calloc() zeroes the memory, so nothing touches the buffer before
	 * the NULL check below.
	 */
	array = calloc(nr_rxqs, sizeof(*array));
	if (!array) {
		fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs);
		exit(EXIT_FAIL_MEM);
	}
	return array;
}
/* Allocate a complete, zero-initialised stats_record: the record itself,
 * the per-RXQ record array and a per-CPU datarec array for the global
 * stats and for every RXQ record.
 *
 * Never returns NULL: on allocation failure an error is printed and the
 * program exits with EXIT_FAIL_MEM.  Release with free_stats_record().
 */
static struct stats_record *alloc_stats_record(void)
{
	unsigned int nr_rxqs = map_data[2].def.max_entries;
	struct stats_record *rec;
	unsigned int i;

	/* calloc() zeroes the memory, so nothing touches the buffer before
	 * the NULL check below.
	 */
	rec = calloc(1, sizeof(*rec));
	if (!rec) {
		fprintf(stderr, "Mem alloc error\n");
		exit(EXIT_FAIL_MEM);
	}

	rec->rxq = alloc_record_per_rxq();
	for (i = 0; i < nr_rxqs; i++)
		rec->rxq[i].cpu = alloc_record_per_cpu();

	rec->stats.cpu = alloc_record_per_cpu();
	return rec;
}
/* Release everything allocated by alloc_stats_record(): the per-CPU
 * array of every RXQ record, the RXQ record array, the global per-CPU
 * array and finally the stats_record itself.
 */
static void free_stats_record(struct stats_record *r)
{
	unsigned int remaining = map_data[2].def.max_entries;

	while (remaining > 0) {
		remaining--;
		free(r->rxq[remaining].cpu);
	}
	free(r->stats.cpu);
	free(r->rxq);
	free(r);
}
/* Read one entry of a per-CPU map into @rec.
 *
 * @fd:  file descriptor of the map
 * @key: key of the entry to read
 * @rec: destination; rec->cpu must have room for one datarec per
 *       possible CPU
 *
 * On success the per-CPU values, their sums (rec->total) and a timestamp
 * are stored in @rec and true is returned.  When the lookup fails an
 * error is printed, @rec is left untouched and false is returned.
 */
static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
{
	/* For percpu maps, userspace gets a value per possible CPU */
	unsigned int nr_cpus = bpf_num_possible_cpus();
	struct datarec values[nr_cpus];
	struct datarec sum = { 0, 0 };
	unsigned int cpu;

	if (bpf_map_lookup_elem(fd, &key, values) != 0) {
		fprintf(stderr,
			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
		return false;
	}

	/* Get time as close as possible to reading map contents */
	rec->timestamp = gettime();

	/* Record and sum values from each CPU */
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		rec->cpu[cpu] = values[cpu];
		sum.processed += values[cpu].processed;
		sum.issue += values[cpu].issue;
	}
	rec->total = sum;

	return true;
}
/* Fill @rec with a fresh snapshot: the single entry of the global stats
 * map and every entry of the rx_queue_index map.  Lookup failures are
 * reported by map_collect_percpu() and otherwise ignored here.
 */
static void stats_collect(struct stats_record *rec)
{
	int rxq_fd = map_data[2].fd; /* map: rx_queue_index_map */
	int nr_rxqs = map_data[2].def.max_entries;
	int rxq;

	/* map: stats_global_map */
	map_collect_percpu(map_data[1].fd, 0, &rec->stats);

	for (rxq = 0; rxq < nr_rxqs; rxq++)
		map_collect_percpu(rxq_fd, rxq, &rec->rxq[rxq]);
}
/* Time between two snapshots in seconds.
 *
 * @r: newer record, @p: previous record.
 * Returns 0 when both records carry the same timestamp.
 */
static double calc_period(struct record *r, struct record *p)
{
	__u64 elapsed_ns = r->timestamp - p->timestamp;

	if (elapsed_ns == 0)
		return 0;

	return (double) elapsed_ns / NANOSEC_PER_SEC;
}
/* Packets per second between two data records.
 *
 * @r: newer record, @p: previous record, @period_: elapsed seconds.
 * Returns 0 unless @period_ is greater than zero.
 */
static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
{
	__u64 delta;
	__u64 rate;

	if (!(period_ > 0))
		return 0;

	delta = r->processed - p->processed;
	rate = delta / period_;
	return rate;
}
/* Issues per second between two data records.
 *
 * @r: newer record, @p: previous record, @period_: elapsed seconds.
 * Returns 0 unless @period_ is greater than zero.
 */
static __u64 calc_errs_pps(struct datarec *r,
			   struct datarec *p, double period_)
{
	__u64 delta;
	__u64 rate;

	if (!(period_ > 0))
		return 0;

	delta = r->issue - p->issue;
	rate = delta / period_;
	return rate;
}
/* Print the rates between two snapshots.
 *
 * @stats_rec:  newest snapshot
 * @stats_prev: previous snapshot
 * @action:     configured XDP action, shown in the header line
 *
 * Two tables are printed: the global stats per CPU plus a total line,
 * and the stats per RX queue and CPU plus a per-queue sum line.  Per-CPU
 * lines are printed only when the pps value is above zero.
 */
static void stats_print(struct stats_record *stats_rec,
			struct stats_record *stats_prev,
			int action)
{
	unsigned int nr_cpus = bpf_num_possible_cpus();
	unsigned int nr_rxqs = map_data[2].def.max_entries;
	double pps = 0, err = 0;
	struct record *rec, *prev;
	double t;
	int rxq;
	int i;

	/* Header */
	printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s\n",
	       ifname, ifindex, action2str(action));

	/* stats_global_map */
	{
		char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n";
		char *fm2_rx = "%-15s %-7s %'-11.0f\n";

		printf("%-15s %-7s %-11s %-11s\n",
		       "XDP stats", "CPU", "pps", "issue-pps");

		rec = &stats_rec->stats;
		prev = &stats_prev->stats;
		t = calc_period(rec, prev);
		for (i = 0; i < nr_cpus; i++) {
			struct datarec *r = &rec->cpu[i];
			struct datarec *p = &prev->cpu[i];
			/* Reset per CPU so that an error label from one
			 * CPU is not carried over to the following ones.
			 */
			char *errstr = "";

			pps = calc_pps (r, p, t);
			err = calc_errs_pps(r, p, t);
			if (err > 0)
				errstr = "invalid-ifindex";
			if (pps > 0)
				printf(fmt_rx, "XDP-RX CPU",
				       i, pps, err, errstr);
		}
		pps = calc_pps (&rec->total, &prev->total, t);
		err = calc_errs_pps(&rec->total, &prev->total, t);
		printf(fm2_rx, "XDP-RX CPU", "total", pps, err);
	}

	/* rx_queue_index_map */
	printf("\n%-15s %-7s %-11s %-11s\n",
	       "RXQ stats", "RXQ:CPU", "pps", "issue-pps");

	for (rxq = 0; rxq < nr_rxqs; rxq++) {
		char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n";
		char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n";
		int rxq_ = rxq;

		/* Last RXQ in map catch overflows */
		if (rxq_ == nr_rxqs - 1)
			rxq_ = -1;

		rec = &stats_rec->rxq[rxq];
		prev = &stats_prev->rxq[rxq];
		t = calc_period(rec, prev);
		for (i = 0; i < nr_cpus; i++) {
			struct datarec *r = &rec->cpu[i];
			struct datarec *p = &prev->cpu[i];
			/* Reset per CPU, see the global table above */
			char *errstr = "";

			pps = calc_pps (r, p, t);
			err = calc_errs_pps(r, p, t);
			if (err > 0) {
				if (rxq_ == -1)
					errstr = "map-overflow-RXQ";
				else
					errstr = "err";
			}
			if (pps > 0)
				printf(fmt_rx, "rx_queue_index",
				       rxq_, i, pps, err, errstr);
		}
		pps = calc_pps (&rec->total, &prev->total, t);
		err = calc_errs_pps(&rec->total, &prev->total, t);
		if (pps || err)
			printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err);
	}
}
/* Exchange the two stats_record pointers that @a and @b point to */
static inline void swap(struct stats_record **a, struct stats_record **b)
{
	struct stats_record *first = *a;
	struct stats_record *second = *b;

	*a = second;
	*b = first;
}
/* Collect and print statistics forever.
 *
 * @interval: seconds to sleep between two rounds
 * @action:   configured XDP action, forwarded to stats_print()
 *
 * Two snapshots are kept; each round swaps them so the previous "newest"
 * becomes "prev" before a fresh snapshot is collected.  The loop has no
 * exit condition, so the frees after it are not reached.
 */
static void stats_poll(int interval, int action)
{
	struct stats_record *record = alloc_stats_record();
	struct stats_record *prev = alloc_stats_record();

	stats_collect(record);

	for (;;) {
		swap(&prev, &record);
		stats_collect(record);
		stats_print(record, prev, action);
		sleep(interval);
	}

	free_stats_record(record);
	free_stats_record(prev);
}
/* Entry point: load the BPF object "<argv[0]>_kern.o", parse the command
 * line, store the config (ifindex + action) in the config map, attach
 * the XDP program to the selected device and print statistics until
 * interrupted.
 *
 * Returns one of the EXIT_* codes (or 1 when setrlimit() fails).
 */
int main(int argc, char **argv)
{
	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
	bool use_separators = true;
	struct config cfg = { 0 };
	char filename[256];
	int longindex = 0;
	int interval = 2;
	__u32 key = 0;
	int opt, err;

	char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 };
	int action = XDP_PASS; /* Default action */
	char *action_str = NULL;

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);

	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
		perror("setrlimit(RLIMIT_MEMLOCK)");
		return 1;
	}

	if (load_bpf_file(filename)) {
		fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
		return EXIT_FAIL;
	}

	if (!prog_fd[0]) {
		fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
		return EXIT_FAIL;
	}

	/* Parse command line args.  The optstring lists every short option
	 * that long_options maps to, so the short forms advertised by
	 * usage() (-s, -z, -a) are accepted as well.
	 */
	while ((opt = getopt_long(argc, argv, "hSd:s:za:",
				  long_options, &longindex)) != -1) {
		switch (opt) {
		case 'd':
			if (strlen(optarg) >= IF_NAMESIZE) {
				fprintf(stderr, "ERR: --dev name too long\n");
				goto error;
			}
			ifname = (char *)&ifname_buf;
			/* Length was checked above, so the copy is terminated */
			strncpy(ifname, optarg, IF_NAMESIZE);
			ifindex = if_nametoindex(ifname);
			if (ifindex == 0) {
				fprintf(stderr,
					"ERR: --dev name unknown err(%d):%s\n",
					errno, strerror(errno));
				goto error;
			}
			break;
		case 's':
			interval = atoi(optarg);
			break;
		case 'S':
			xdp_flags |= XDP_FLAGS_SKB_MODE;
			break;
		case 'z':
			use_separators = false;
			break;
		case 'a':
			action_str = (char *)&action_str_buf;
			/* Buffer is zeroed and one byte larger than the copy
			 * limit, so the result is always terminated.
			 */
			strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN);
			break;
		case 'h':
		error:
		default:
			usage(argv);
			return EXIT_FAIL_OPTION;
		}
	}

	/* Required option */
	if (ifindex == -1) {
		fprintf(stderr, "ERR: required option --dev missing\n");
		usage(argv);
		return EXIT_FAIL_OPTION;
	}
	cfg.ifindex = ifindex;

	/* Parse action string */
	if (action_str) {
		action = parse_xdp_action(action_str);
		if (action < 0) {
			fprintf(stderr, "ERR: Invalid XDP --action: %s\n",
				action_str);
			list_xdp_actions();
			return EXIT_FAIL_OPTION;
		}
	}
	cfg.action = action;

	/* Trick to pretty printf with thousands separators use %' */
	if (use_separators)
		setlocale(LC_NUMERIC, "en_US");

	/* User-side setup ifindex in config_map */
	err = bpf_map_update_elem(map_fd[0], &key, &cfg, 0);
	if (err) {
		fprintf(stderr, "Store config failed (err:%d)\n", err);
		exit(EXIT_FAIL_BPF);
	}

	/* Remove XDP program when program is interrupted */
	signal(SIGINT, int_exit);

	if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
		fprintf(stderr, "link set xdp fd failed\n");
		return EXIT_FAIL_XDP;
	}

	stats_poll(interval, action);
	return EXIT_OK;
}