[v4] net/mlx4: support hardware TSO
Commit Message
Implement support for hardware TSO.
Signed-off-by: Moti Haimovsky <motih@mellanox.com>
---
v4:
* Bug fixes in filling TSO data segments.
* Modifications according to review inputs from Adrien Mazarguil
and Matan Azrad.
in reply to
1530190137-17848-1-git-send-email-motih@mellanox.com
v3:
* Fixed compilation errors in compilers without GNU C extensions
caused by a declaration of zero-length array in the code.
in reply to
1530187032-6489-1-git-send-email-motih@mellanox.com
v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-motih@mellanox.com
v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-motih@mellanox.com
---
doc/guides/nics/features/mlx4.ini | 1 +
doc/guides/nics/mlx4.rst | 3 +
drivers/net/mlx4/Makefile | 5 +
drivers/net/mlx4/mlx4.c | 9 +
drivers/net/mlx4/mlx4.h | 5 +
drivers/net/mlx4/mlx4_prm.h | 15 ++
drivers/net/mlx4/mlx4_rxtx.c | 362 +++++++++++++++++++++++++++++++++++++-
drivers/net/mlx4/mlx4_rxtx.h | 2 +-
drivers/net/mlx4/mlx4_txq.c | 8 +-
9 files changed, 406 insertions(+), 4 deletions(-)
Comments
Hi Moti,
Please see inline.
From: Mordechay Haimovsky
> Implement support for hardware TSO.
>
> Signed-off-by: Moti Haimovsky <motih@mellanox.com>
> ---
> v4:
> * Bug fixes in filling TSO data segments.
> * Modifications according to review inputs from Adrien Mazarguil
> and Matan Azrad.
> in reply to
> 1530190137-17848-1-git-send-email-motih@mellanox.com
>
> v3:
> * Fixed compilation errors in compilers without GNU C extensions
> caused by a declaration of zero-length array in the code.
> in reply to
> 1530187032-6489-1-git-send-email-motih@mellanox.com
>
> v2:
> * Fixed coding style warning.
> in reply to
> 1530184583-30166-1-git-send-email-motih@mellanox.com
>
> v1:
> * Fixed coding style warnings.
> in reply to
> 1530181779-19716-1-git-send-email-motih@mellanox.com
> ---
> doc/guides/nics/features/mlx4.ini | 1 +
> doc/guides/nics/mlx4.rst | 3 +
> drivers/net/mlx4/Makefile | 5 +
> drivers/net/mlx4/mlx4.c | 9 +
> drivers/net/mlx4/mlx4.h | 5 +
> drivers/net/mlx4/mlx4_prm.h | 15 ++
> drivers/net/mlx4/mlx4_rxtx.c | 362 +++++++++++++++++++++++++++++++++++++-
> drivers/net/mlx4/mlx4_rxtx.h | 2 +-
> drivers/net/mlx4/mlx4_txq.c | 8 +-
> 9 files changed, 406 insertions(+), 4 deletions(-)
>
> diff --git a/doc/guides/nics/features/mlx4.ini
> b/doc/guides/nics/features/mlx4.ini
> index f6efd21..98a3f61 100644
> --- a/doc/guides/nics/features/mlx4.ini
> +++ b/doc/guides/nics/features/mlx4.ini
> @@ -13,6 +13,7 @@ Queue start/stop = Y
> MTU update = Y
> Jumbo frame = Y
> Scattered Rx = Y
> +TSO = Y
> Promiscuous mode = Y
> Allmulticast mode = Y
> Unicast MAC filter = Y
> diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
> index 491106a..12adaeb 100644
> --- a/doc/guides/nics/mlx4.rst
> +++ b/doc/guides/nics/mlx4.rst
> @@ -142,6 +142,9 @@ Limitations
> The ability to enable/disable CRC stripping requires OFED version
> 4.3-1.5.0.0 and above or rdma-core version v18 and above.
>
> +- TSO (Transmit Segmentation Offload) is supported in OFED version
> + 4.4 and above or in rdma-core version v18 and above.
> +
> Prerequisites
> -------------
>
> diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
> index 73f9d40..63bc003 100644
> --- a/drivers/net/mlx4/Makefile
> +++ b/drivers/net/mlx4/Makefile
> @@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
> mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
> $Q $(RM) -f -- '$@'
> $Q : > '$@'
> + $Q sh -- '$<' '$@' \
> + HAVE_IBV_MLX4_WQE_LSO_SEG \
> + infiniband/mlx4dv.h \
> + type 'struct mlx4_wqe_lso_seg' \
> + $(AUTOCONF_OUTPUT)
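(A hedged side note: when the type check succeeds, auto-config-h.sh emits a guard macro into mlx4_autoconf.h, along the lines of

	#define HAVE_IBV_MLX4_WQE_LSO_SEG 1

the exact emitted line is an assumption about the script's output. mlx4_prm.h below uses this macro to decide whether to supply its own struct mlx4_wqe_lso_seg fallback for rdma-core v17 and older.)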
>
> # Create mlx4_autoconf.h or update it in case it differs from the new one.
>
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
> index d151a90..5d8c76d 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -677,6 +677,15 @@ struct mlx4_conf {
>
> IBV_RAW_PACKET_CAP_SCATTER_FCS);
> DEBUG("FCS stripping toggling is %ssupported",
> priv->hw_fcs_strip ? "" : "not ");
> + priv->tso =
> + ((device_attr_ex.tso_caps.max_tso > 0) &&
> + (device_attr_ex.tso_caps.supported_qpts &
> + (1 << IBV_QPT_RAW_PACKET)));
> + if (priv->tso)
> + priv->tso_max_payload_sz =
> + device_attr_ex.tso_caps.max_tso;
> + DEBUG("TSO is %ssupported",
> + priv->tso ? "" : "not ");
> /* Configure the first MAC address by default. */
> err = mlx4_get_mac(priv, &mac.addr_bytes);
> if (err) {
> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
> index 300cb4d..89d8c38 100644
> --- a/drivers/net/mlx4/mlx4.h
> +++ b/drivers/net/mlx4/mlx4.h
> @@ -47,6 +47,9 @@
> /** Interrupt alarm timeout value in microseconds. */
> #define MLX4_INTR_ALARM_TIMEOUT 100000
>
> +/* Maximum packet headers size (L2+L3+L4) for TSO. */
> +#define MLX4_MAX_TSO_HEADER 192
> +
> /** Port parameter. */
> #define MLX4_PMD_PORT_KVARG "port"
>
> @@ -90,6 +93,8 @@ struct priv {
> uint32_t hw_csum:1; /**< Checksum offload is supported. */
> uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
> uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
> + uint32_t tso:1; /**< Transmit segmentation offload is supported. */
> + uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
> uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
> struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
> struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
> diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
> index b771d8c..aef77ba 100644
> --- a/drivers/net/mlx4/mlx4_prm.h
> +++ b/drivers/net/mlx4/mlx4_prm.h
> @@ -19,6 +19,7 @@
> #ifdef PEDANTIC
> #pragma GCC diagnostic error "-Wpedantic"
> #endif
> +#include "mlx4_autoconf.h"
>
> /* ConnectX-3 Tx queue basic block. */
> #define MLX4_TXBB_SHIFT 6
> @@ -40,6 +41,7 @@
> /* Work queue element (WQE) flags. */
> #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
> #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> +#define MLX4_WQE_CTRL_RR (1 << 6)
>
> /* CQE checksum flags. */
> enum {
> @@ -98,6 +100,19 @@ struct mlx4_cq {
> int arm_sn; /**< Rx event counter. */
> };
>
> +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
> +/*
> + * WQE LSO segment structure.
> + * Defined here as backward compatibility for rdma-core v17 and below.
> + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
> + * and above.
> + */
> +struct mlx4_wqe_lso_seg {
> + rte_be32_t mss_hdr_size;
> + rte_be32_t header[];
> +};
> +#endif
> +
> /**
> * Retrieve a CQE entry from a CQ.
> *
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 78b6dd5..750ad6d 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -38,10 +38,29 @@
> * DWORD (32 byte) of a TXBB.
> */
> struct pv {
> - volatile struct mlx4_wqe_data_seg *dseg;
> + union {
> + volatile struct mlx4_wqe_data_seg *dseg;
> + volatile uint32_t *dst;
> + };
> uint32_t val;
> };
>
> +/** A helper structure for TSO packet handling. */
> +struct tso_info {
> + /** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
> + struct pv *pv;
> + /** Current entry in the pv array. */
> + int pv_counter;
> + /** Total size of the WQE including padding. */
> + uint32_t wqe_size;
> + /** size of TSO header to prepend to each packet to send. */
size => Size
> + uint16_t tso_header_sz;
tso_header_sz => tso_header_size ("size" like the next field's name).
> + /** Total size of the TSO segment in the WQE. */
> + uint16_t wqe_tso_seg_size;
> + /** Raw WQE size in units of 16 Bytes and without padding. */
> + uint8_t fence_size;
> +};
> +
> /** A table to translate Rx completion flags to packet type. */
> uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
> /*
> @@ -368,6 +387,335 @@ struct pv {
> }
>
> /**
> + * Obtain and calculate TSO information needed for assembling a TSO WQE.
> + *
> + * @param buf
> + * Pointer to the first packet mbuf.
> + * @param txq
> + * Pointer to Tx queue structure.
> + * @param tinfo
> + * Pointer to a structure to fill the info with.
> + *
> + * @return
> + * 0 on success, negative value upon error.
> + */
> +static inline int
> +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> + struct txq *txq,
> + struct tso_info *tinfo)
> +{
> + struct mlx4_sq *sq = &txq->msq;
> + const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> + (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> +
> + tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
> + if (tunneled)
> + tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
> + if (unlikely(buf->tso_segsz == 0 ||
> + tinfo->tso_header_sz == 0 ||
> + tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER ||
> + tinfo->tso_header_sz > buf->data_len))
> + return -EINVAL;
> + /*
> + * Calculate the WQE TSO segment size
> + * Note:
> + * 1. An LSO segment must be padded such that the subsequent data
> + * segment is 16-byte aligned.
> + * 2. The start address of the TSO segment is always 16 Bytes aligned.
> + */
> + tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
> + tinfo->tso_header_sz,
> + sizeof(struct mlx4_wqe_data_seg));
> + tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> + tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> + buf->nb_segs;
> + tinfo->wqe_size =
> + RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
> + MLX4_TXBB_SIZE);
> + /* Validate WQE size and WQE space in the send queue. */
> + if (sq->remain_size < tinfo->wqe_size ||
> + tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> + return -ENOMEM;
> + /* Init pv. */
> + tinfo->pv = (struct pv *)txq->bounce_buf;
> + tinfo->pv_counter = 0;
> + return 0;
> +}
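For a concrete feel of the sizing math above, a worked example (illustrative numbers; assumes the usual mlx4 layout of a 16 B ctrl segment, a 4 B LSO descriptor, 16 B data segments and 64 B TXBBs): a two-segment TCP/IPv4 mbuf with a 54-byte header (14 + 20 + 20) gives

	wqe_tso_seg_size = RTE_ALIGN(4 + 54, 16);  /* = 64 */
	fence_size = ((16 + 64) >> 4) + 2;         /* = 7, in 16-byte units */
	wqe_size = RTE_ALIGN(7 << 4, 64);          /* = 128, i.e. two TXBBs */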
> +
> +/**
> + * Fill the TSO WQE data segments with info on buffers to transmit.
> + *
> + * @param buf
> + * Pointer to the first packet mbuf.
> + * @param txq
> + * Pointer to Tx queue structure.
> + * @param tinfo
> + * Pointer to TSO info to use.
> + * @param dseg
> + * Pointer to the first data segment in the TSO WQE.
> + *
> + * @return
> + * Pointer to the next WQE control segment on success, NULL otherwise.
> + */
> +static inline volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
> + struct txq *txq,
> + struct tso_info *tinfo,
> + volatile struct mlx4_wqe_data_seg *dseg,
> + volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> + uint32_t lkey;
> + int nb_segs = buf->nb_segs;
> + int nb_segs_txbb;
> + struct mlx4_sq *sq = &txq->msq;
> + struct rte_mbuf *sbuf = buf;
> + struct pv *pv = tinfo->pv;
> + int *pv_counter = &tinfo->pv_counter;
> + uint16_t sb_of = tinfo->tso_header_sz;
> + uint16_t data_len;
> +
> + while (nb_segs > 0) {
I think a do-while statement is better here (no need for the check on the first iteration).
> + /* how many dseg entries do we have in the current TXBB ? */
> + nb_segs_txbb =
> + (MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
> + ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
> + sizeof(struct mlx4_wqe_data_seg);
Division may be expensive; you can avoid it as follows:
nb_segs_txbb = (MLX4_TXBB_SIZE - ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >> MLX4_SEG_SHIFT;
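A self-contained sanity check of that equivalence (a hypothetical standalone program; MLX4_WQE_DSEG_SIZE is my shorthand for sizeof(struct mlx4_wqe_data_seg), and the constants mirror mlx4_prm.h where MLX4_SEG_SHIFT is 4):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define MLX4_TXBB_SIZE 64	/* Tx basic block size in bytes. */
	#define MLX4_SEG_SHIFT 4	/* log2 of the 16-byte WQE segment size. */
	#define MLX4_WQE_DSEG_SIZE 16	/* sizeof(struct mlx4_wqe_data_seg). */

	int main(void)
	{
		uintptr_t dseg;

		/* dseg is always 16-byte aligned within a 64-byte TXBB. */
		for (dseg = 0; dseg < 4 * MLX4_TXBB_SIZE; dseg += MLX4_WQE_DSEG_SIZE) {
			unsigned int by_div =
				MLX4_TXBB_SIZE / MLX4_WQE_DSEG_SIZE -
				(dseg & (MLX4_TXBB_SIZE - 1)) / MLX4_WQE_DSEG_SIZE;
			unsigned int by_shift =
				(MLX4_TXBB_SIZE -
				 (dseg & (MLX4_TXBB_SIZE - 1))) >> MLX4_SEG_SHIFT;

			assert(by_div == by_shift);
			printf("offset %2u: %u dsegs left in TXBB\n",
			       (unsigned int)(dseg & (MLX4_TXBB_SIZE - 1)), by_div);
		}
		return 0;
	}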
> + switch (nb_segs_txbb) {
> + case 4:
> + /* Memory region key for this memory pool. */
> + lkey = mlx4_tx_mb2mr(txq, sbuf);
> + if (unlikely(lkey == (uint32_t)-1))
> + goto lkey_err;
> + dseg->addr =
> + rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> + uintptr_t,
> + sb_of));
> + dseg->lkey = lkey;
> + /*
> + * This data segment starts at the beginning of a new
> + * TXBB, so we need to postpone its byte_count writing
> + * for later.
> + */
> + pv[*pv_counter].dseg = dseg;
> + /*
> + * Zero length segment is treated as inline segment
> + * with zero data.
> + */
> + data_len = sbuf->data_len - sb_of;
> + pv[(*pv_counter)++].val =
> + rte_cpu_to_be_32(data_len ?
> + data_len :
> + 0x80000000);
> + sb_of = 0;
> + sbuf = sbuf->next;
> + dseg++;
> + if (--nb_segs == 0)
> + break;
I think that here, and in all the other cases, it is better to do "return X" instead of break. X is the same return value as now, and it can be calculated at the start.
> + /* fallthrough */
> + case 3:
> + lkey = mlx4_tx_mb2mr(txq, sbuf);
> + if (unlikely(lkey == (uint32_t)-1))
> + goto lkey_err;
> + data_len = sbuf->data_len - sb_of;
> + mlx4_fill_tx_data_seg(dseg,
> + lkey,
> + rte_pktmbuf_mtod_offset(sbuf,
> + uintptr_t,
> + sb_of),
> + rte_cpu_to_be_32(data_len ?
> + data_len :
> + 0x80000000));
> + sb_of = 0;
> + sbuf = sbuf->next;
> + dseg++;
> + if (--nb_segs == 0)
> + break;
> + /* fallthrough */
> + case 2:
> + lkey = mlx4_tx_mb2mr(txq, sbuf);
> + if (unlikely(lkey == (uint32_t)-1))
> + goto lkey_err;
> + data_len = sbuf->data_len - sb_of;
> + mlx4_fill_tx_data_seg(dseg,
> + lkey,
> + rte_pktmbuf_mtod_offset(sbuf,
> + uintptr_t,
> + sb_of),
> + rte_cpu_to_be_32(data_len ?
> + data_len :
> + 0x80000000));
> + sb_of = 0;
> + sbuf = sbuf->next;
> + dseg++;
> + if (--nb_segs == 0)
> + break;
> + /* fallthrough */
> + case 1:
> + lkey = mlx4_tx_mb2mr(txq, sbuf);
> + if (unlikely(lkey == (uint32_t)-1))
> + goto lkey_err;
> + data_len = sbuf->data_len - sb_of;
> + mlx4_fill_tx_data_seg(dseg,
> + lkey,
> + rte_pktmbuf_mtod_offset(sbuf,
> + uintptr_t,
> + sb_of),
> + rte_cpu_to_be_32(data_len ?
> + data_len :
> + 0x80000000));
> + sb_of = 0;
> + sbuf = sbuf->next;
> + dseg++;
> + --nb_segs;
> + break;
> + default:
> + /* Should never happen */
> + rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
> + (void *)txq, nb_segs_txbb);
I think we don't need the default case here. Do you have any scenario where it may really happen?
> + }
> + /* Wrap dseg if it points at the end of the queue. */
> + if ((volatile uint8_t *)dseg >= sq->eob)
> + dseg = (volatile struct mlx4_wqe_data_seg *)
> + ((volatile uint8_t *)dseg - sq->size);
> + }
> + /* Align next WQE address to the next TXBB. */
> + return (volatile struct mlx4_wqe_ctrl_seg *)
> + ((volatile uint8_t *)ctrl + tinfo->wqe_size);
> +lkey_err:
> + return NULL;
> +}
> +
> +/**
> + * Fill the packet's l2, l3 and l4 headers to the WQE.
> + *
> + * This will be used as the header for each TSO segment that is transmitted.
> + *
> + * @param buf
> + * Pointer to the first packet mbuf.
> + * @param txq
> + * Pointer to Tx queue structure.
> + * @param tinfo
> + * Pointer to TSO info to use.
> + * @param ctrl
> + * Pointer to the control segment in the TSO WQE.
> + *
> + * @return
> + * Pointer to the first data segment of the TSO WQE on success, NULL otherwise.
> + */
> +static inline volatile struct mlx4_wqe_data_seg *
> +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> + struct txq *txq,
> + struct tso_info *tinfo,
> + volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> + volatile struct mlx4_wqe_lso_seg *tseg =
> + (volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> + struct mlx4_sq *sq = &txq->msq;
> + struct pv *pv = tinfo->pv;
> + int *pv_counter = &tinfo->pv_counter;
> + int remain_sz = tinfo->tso_header_sz;
> + char *from = rte_pktmbuf_mtod(buf, char *);
> + uint16_t txbb_avail_space;
> + int copy_sz;
> + /* Union to overcome volatile constraints when copying TSO header. */
> + union {
> + volatile uint8_t *vto;
> + uint8_t *to;
> + } thdr = { .vto = (volatile uint8_t *)tseg->header, };
> +
> + /*
> + * TSO data always starts at offset 20 from the beginning of the TXBB
> + * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> + * we can write the first 44 TSO header bytes without worry for TxQ
> + * wrapping or overwriting the first TXBB 32bit word.
> + */
> + txbb_avail_space = MLX4_TXBB_SIZE -
> + (sizeof(struct mlx4_wqe_ctrl_seg) +
> + sizeof(struct mlx4_wqe_lso_seg));
> + do {
> + copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
> + rte_memcpy(thdr.to, from, copy_sz);
> + remain_sz -= copy_sz;
> + if (remain_sz <= 0)
> + break;
> + from += copy_sz;
> + thdr.to += copy_sz;
> + /* New TXBB, Check for TxQ wrap. */
> + if (thdr.to >= sq->eob)
> + thdr.vto = sq->buf;
> + /* New TXBB, stash the first 32bits for later use. */
> + pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
> + rte_memcpy(&pv[*pv_counter].val, from,
> + RTE_MIN((size_t)remain_sz, sizeof(uint32_t)));
> + (*pv_counter)++;
> + from += sizeof(uint32_t);
> + thdr.to += sizeof(uint32_t);
> + remain_sz -= sizeof(uint32_t);
> + /* Space in current TXBB is TXBB size - 4 */
> + txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> + } while (remain_sz > 0);
I think the loop can be better: you now have 5 checks per TXBB, and we can reduce it to 2 as follows:
txbb_data_space = 44; (does not include the first 4 bytes of the current txbb)
while (remain_size >= txbb_data_space + 4) // loop to write the tail of the current txbb + the head of the next txbb
	write txbb_data_space bytes to the WQE.
	Check wrap-around.
	Write 4 bytes for the next txbb to pv.
	remain_size -= txbb_data_space + 4
	txbb_data_space = 60
if (remain_size > txbb_data_space) // write tail and partial head
	write txbb_data_space bytes to the WQE.
	Check wrap-around.
	Write (remain_size - txbb_data_space) bytes to pv.
else // write only tail
	write remain_size bytes from the header.
Am I missing something?
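For concreteness, a minimal standalone sketch of that proposal (hypothetical names throughout; the TxQ wrap-around check is reduced to a comment since it needs the real sq state):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define TXBB_SIZE 64

	/* Deferred first-DWORD write, mirroring the driver's struct pv. */
	struct pv_entry {
		uint8_t *dst;
		uint32_t val;
	};

	/*
	 * Copy a TSO header into consecutive TXBBs with two checks per TXBB.
	 * 'to' starts at offset 20 of the first TXBB (16 B ctrl + 4 B LSO desc);
	 * the first DWORD of every following TXBB is stashed in pv[] for later.
	 * Returns the number of deferred writes.
	 */
	static int copy_tso_header(uint8_t *to, const uint8_t *from, int remain_sz,
				   struct pv_entry *pv)
	{
		int pv_counter = 0;
		int txbb_data_space = TXBB_SIZE - 20; /* 44 B left in first TXBB */

		while (remain_sz >= txbb_data_space + 4) {
			memcpy(to, from, txbb_data_space); /* tail of current TXBB */
			to += txbb_data_space;
			from += txbb_data_space;
			remain_sz -= txbb_data_space;
			/* (a real implementation would check for TxQ wrap here) */
			pv[pv_counter].dst = to;	/* head of next TXBB */
			memcpy(&pv[pv_counter].val, from, 4);
			pv_counter++;
			to += 4;
			from += 4;
			remain_sz -= 4;
			txbb_data_space = TXBB_SIZE - 4; /* 60 B from now on */
		}
		if (remain_sz > txbb_data_space) {
			memcpy(to, from, txbb_data_space); /* tail ... */
			to += txbb_data_space;
			from += txbb_data_space;
			remain_sz -= txbb_data_space;
			pv[pv_counter].dst = to;	/* ... plus a partial head */
			pv[pv_counter].val = 0;
			memcpy(&pv[pv_counter].val, from, remain_sz);
			pv_counter++;
		} else if (remain_sz > 0) {
			memcpy(to, from, remain_sz); /* header ends in this TXBB */
		}
		return pv_counter;
	}

	int main(void)
	{
		uint8_t wqe[4 * TXBB_SIZE], hdr[192];
		struct pv_entry pv[4];
		int n;

		memset(hdr, 0xab, sizeof(hdr));
		n = copy_tso_header(wqe + 20, hdr, 54, pv);
		printf("54-byte header: %d deferred DWORD(s)\n", n);  /* 1 */
		n = copy_tso_header(wqe + 20, hdr, 120, pv);
		printf("120-byte header: %d deferred DWORD(s)\n", n); /* 2 */
		return 0;
	}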
> + tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> + tinfo->tso_header_sz);
> + /* Calculate data segment location */
> + return (volatile struct mlx4_wqe_data_seg *)
> + ((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> +}
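A quick worked example of the mss_hdr_size encoding (illustrative numbers): with tso_segsz = 1460 and a 54-byte header,

	(1460 << 16) | 54 == 0x05B40036

so the MSS sits in the upper 16 bits and the header size in the lower 16, before the big-endian conversion.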
> +
> +/**
> + * Write data segments and header for TSO uni/multi segment packet.
> + *
> + * @param buf
> + * Pointer to the first packet mbuf.
> + * @param txq
> + * Pointer to Tx queue structure.
> + * @param ctrl
> + * Pointer to the WQE control segment.
> + *
> + * @return
> + * Pointer to the next WQE control segment on success, NULL otherwise.
> + */
> +static volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
> + volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> + volatile struct mlx4_wqe_data_seg *dseg;
> + volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> + struct mlx4_sq *sq = &txq->msq;
> + struct tso_info tinfo;
> + struct pv *pv;
> + int pv_counter;
> + int ret;
> +
> + ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> + if (unlikely(ret))
> + goto error;
> + dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> + if (unlikely(dseg == NULL))
> + goto error;
> + if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> + dseg = (volatile struct mlx4_wqe_data_seg *)
> + ((uintptr_t)dseg - sq->size);
> + ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> + if (unlikely(ctrl_next == NULL))
> + goto error;
> + /* Write the first DWORD of each TXBB saved earlier. */
> + if (tinfo.pv_counter) {
I think you can add likely() here.
The minimum set of segments is:
1. ctrl segment
2. Ethernet header
3. IP header
4. TCP header
5. at least 1 data segment.
Maybe we don't even need this check.
> + pv = tinfo.pv;
> + pv_counter = tinfo.pv_counter;
> + /* Need a barrier here before writing the first TXBB word. */
> + rte_io_wmb();
> + for (--pv_counter; pv_counter >= 0; pv_counter--)
> + *pv[pv_counter].dst = pv[pv_counter].val;
> + }
> + ctrl->fence_size = tinfo.fence_size;
> + sq->remain_size -= tinfo.wqe_size;
> + return ctrl_next;
> +error:
> + txq->stats.odropped++;
> + return NULL;
> +}
> +
> +/**
> * Write data segments of multi-segment packet.
> *
> * @param buf
> @@ -560,6 +908,7 @@ struct pv {
> uint16_t flags16[2];
> } srcrb;
> uint32_t lkey;
> + bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
>
> /* Clean up old buffer. */
> if (likely(elt->buf != NULL)) {
> @@ -578,7 +927,16 @@ struct pv {
> } while (tmp != NULL);
> }
> RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> - if (buf->nb_segs == 1) {
> + if (tso) {
> + /* Change opcode to TSO */
> + owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> + owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
> + ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> + if (!ctrl_next) {
> + elt->buf = NULL;
> + break;
> + }
> + } else if (buf->nb_segs == 1) {
> /* Validate WQE space in the send queue. */
> if (sq->remain_size < MLX4_TXBB_SIZE) {
> elt->buf = NULL;
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index 4c025e3..ffa8abf 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
> unsigned int idx; /**< Mapping index. */
> uint64_t opackets; /**< Total of successfully sent packets. */
> uint64_t obytes; /**< Total of successfully sent bytes. */
> - uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
> + uint64_t odropped; /**< Total number of packets failed to transmit. */
> };
>
> /** Tx queue descriptor. */
> diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> index 6edaadb..9aa7440 100644
> --- a/drivers/net/mlx4/mlx4_txq.c
> +++ b/drivers/net/mlx4/mlx4_txq.c
> @@ -116,8 +116,14 @@
> DEV_TX_OFFLOAD_UDP_CKSUM |
> DEV_TX_OFFLOAD_TCP_CKSUM);
> }
> - if (priv->hw_csum_l2tun)
> + if (priv->tso)
> + offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> + if (priv->hw_csum_l2tun) {
> offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> + if (priv->tso)
> + offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> + DEV_TX_OFFLOAD_GRE_TNL_TSO);
> + }
> return offloads;
> }
>
> --
> 1.8.3.1