[v4] net/mlx4: support hardware TSO

Message ID 1530715998-15703-1-git-send-email-motih@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Series: [v4] net/mlx4: support hardware TSO

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Moti Haimovsky July 4, 2018, 2:53 p.m. UTC
  Implement support for hardware TSO.

Signed-off-by: Moti Haimovsky <motih@mellanox.com>
---
v4:
* Bug fixes in filling TSO data segments.
* Modifications according to review inputs from Adrien Mazarguil
  and Matan Azrad.
in reply to
1530190137-17848-1-git-send-email-motih@mellanox.com

v3:
* Fixed compilation errors in compilers without GNU C extensions
  caused by a declaration of zero-length array in the code.
in reply to
1530187032-6489-1-git-send-email-motih@mellanox.com

v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-motih@mellanox.com

v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-motih@mellanox.com
---
 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |   3 +
 drivers/net/mlx4/Makefile         |   5 +
 drivers/net/mlx4/mlx4.c           |   9 +
 drivers/net/mlx4/mlx4.h           |   5 +
 drivers/net/mlx4/mlx4_prm.h       |  15 ++
 drivers/net/mlx4/mlx4_rxtx.c      | 362 +++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
 drivers/net/mlx4/mlx4_txq.c       |   8 +-
 9 files changed, 406 insertions(+), 4 deletions(-)
  

Comments

Matan Azrad July 5, 2018, 12:30 p.m. UTC | #1
Hi Moti,

Please see inline.

From: Mordechay Haimovsky
> Implement support for hardware TSO.
> 
> Signed-off-by: Moti Haimovsky <motih@mellanox.com>
> ---
> v4:
> * Bug fixes in filling TSO data segments.
> * Modifications according to review inputs from Adrien Mazarguil
>   and Matan Azrad.
> in reply to
> 1530190137-17848-1-git-send-email-motih@mellanox.com
> 
> v3:
> * Fixed compilation errors in compilers without GNU C extensions
>   caused by a declaration of zero-length array in the code.
> in reply to
> 1530187032-6489-1-git-send-email-motih@mellanox.com
> 
> v2:
> * Fixed coding style warning.
> in reply to
> 1530184583-30166-1-git-send-email-motih@mellanox.com
> 
> v1:
> * Fixed coding style warnings.
> in reply to
> 1530181779-19716-1-git-send-email-motih@mellanox.com
> ---
>  doc/guides/nics/features/mlx4.ini |   1 +
>  doc/guides/nics/mlx4.rst          |   3 +
>  drivers/net/mlx4/Makefile         |   5 +
>  drivers/net/mlx4/mlx4.c           |   9 +
>  drivers/net/mlx4/mlx4.h           |   5 +
>  drivers/net/mlx4/mlx4_prm.h       |  15 ++
>  drivers/net/mlx4/mlx4_rxtx.c      | 362 +++++++++++++++++++++++++++++++++++++-
>  drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
>  drivers/net/mlx4/mlx4_txq.c       |   8 +-
>  9 files changed, 406 insertions(+), 4 deletions(-)
> 
> diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
> index f6efd21..98a3f61 100644
> --- a/doc/guides/nics/features/mlx4.ini
> +++ b/doc/guides/nics/features/mlx4.ini
> @@ -13,6 +13,7 @@ Queue start/stop     = Y
>  MTU update           = Y
>  Jumbo frame          = Y
>  Scattered Rx         = Y
> +TSO                  = Y
>  Promiscuous mode     = Y
>  Allmulticast mode    = Y
>  Unicast MAC filter   = Y
> diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
> index 491106a..12adaeb 100644
> --- a/doc/guides/nics/mlx4.rst
> +++ b/doc/guides/nics/mlx4.rst
> @@ -142,6 +142,9 @@ Limitations
>    The ability to enable/disable CRC stripping requires OFED version
>    4.3-1.5.0.0 and above  or rdma-core version v18 and above.
> 
> +- TSO (Transmit Segmentation Offload) is supported in OFED version
> +  4.4 and above or in rdma-core version v18 and above.
> +
>  Prerequisites
>  -------------
> 
> diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
> index 73f9d40..63bc003 100644
> --- a/drivers/net/mlx4/Makefile
> +++ b/drivers/net/mlx4/Makefile
> @@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
>  mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
>  	$Q $(RM) -f -- '$@'
>  	$Q : > '$@'
> +	$Q sh -- '$<' '$@' \
> +		HAVE_IBV_MLX4_WQE_LSO_SEG \
> +		infiniband/mlx4dv.h \
> +		type 'struct mlx4_wqe_lso_seg' \
> +		$(AUTOCONF_OUTPUT)
> 
>  # Create mlx4_autoconf.h or update it in case it differs from the new one.
> 
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
> index d151a90..5d8c76d 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -677,6 +677,15 @@ struct mlx4_conf {
>  					IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DEBUG("FCS stripping toggling is %ssupported",
>  		      priv->hw_fcs_strip ? "" : "not ");
> +		priv->tso =
> +			((device_attr_ex.tso_caps.max_tso > 0) &&
> +			 (device_attr_ex.tso_caps.supported_qpts &
> +			  (1 << IBV_QPT_RAW_PACKET)));
> +		if (priv->tso)
> +			priv->tso_max_payload_sz =
> +					device_attr_ex.tso_caps.max_tso;
> +		DEBUG("TSO is %ssupported",
> +		      priv->tso ? "" : "not ");
>  		/* Configure the first MAC address by default. */
>  		err = mlx4_get_mac(priv, &mac.addr_bytes);
>  		if (err) {
> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
> index 300cb4d..89d8c38 100644
> --- a/drivers/net/mlx4/mlx4.h
> +++ b/drivers/net/mlx4/mlx4.h
> @@ -47,6 +47,9 @@
>  /** Interrupt alarm timeout value in microseconds. */
>  #define MLX4_INTR_ALARM_TIMEOUT 100000
> 
> +/* Maximum packet headers size (L2+L3+L4) for TSO. */
> +#define MLX4_MAX_TSO_HEADER 192
> +
>  /** Port parameter. */
>  #define MLX4_PMD_PORT_KVARG "port"
> 
> @@ -90,6 +93,8 @@ struct priv {
>  	uint32_t hw_csum:1; /**< Checksum offload is supported. */
>  	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
>  	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
> +	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
> +	uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
>  	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
>  	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
>  	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
> diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
> index b771d8c..aef77ba 100644
> --- a/drivers/net/mlx4/mlx4_prm.h
> +++ b/drivers/net/mlx4/mlx4_prm.h
> @@ -19,6 +19,7 @@
>  #ifdef PEDANTIC
>  #pragma GCC diagnostic error "-Wpedantic"
>  #endif
> +#include "mlx4_autoconf.h"
> 
>  /* ConnectX-3 Tx queue basic block. */
>  #define MLX4_TXBB_SHIFT 6
> @@ -40,6 +41,7 @@
>  /* Work queue element (WQE) flags. */
>  #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
>  #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> +#define MLX4_WQE_CTRL_RR (1 << 6)
> 
>  /* CQE checksum flags. */
>  enum {
> @@ -98,6 +100,19 @@ struct mlx4_cq {
>  	int arm_sn; /**< Rx event counter. */
>  };
> 
> +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
> +/*
> + * WQE LSO segment structure.
> + * Defined here as backward compatibility for rdma-core v17 and below.
> + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
> + * and above.
> + */
> +struct mlx4_wqe_lso_seg {
> +	rte_be32_t mss_hdr_size;
> +	rte_be32_t header[];
> +};
> +#endif
> +
>  /**
>   * Retrieve a CQE entry from a CQ.
>   *
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 78b6dd5..750ad6d 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -38,10 +38,29 @@
>   * DWORD (32 byte) of a TXBB.
>   */
>  struct pv {
> -	volatile struct mlx4_wqe_data_seg *dseg;
> +	union {
> +		volatile struct mlx4_wqe_data_seg *dseg;
> +		volatile uint32_t *dst;
> +	};
>  	uint32_t val;
>  };
> 
> +/** A helper structure for TSO packet handling. */
> +struct tso_info {
> +	/** Pointer to the array of saved first DWORD (32 bit) of a TXBB. */
> +	struct pv *pv;
> +	/** Current entry in the pv array. */
> +	int pv_counter;
> +	/** Total size of the WQE including padding. */
> +	uint32_t wqe_size;
> +	/** size of TSO header to prepend to each packet to send. */

size => Size

> +	uint16_t tso_header_sz;
tso_header_sz => tso_header_size

"size", like the next field's name.

> +	/** Total size of the TSO segment in the WQE. */
> +	uint16_t wqe_tso_seg_size;
> +	/** Raw WQE size in units of 16 Bytes and without padding. */
> +	uint8_t fence_size;
> +};
> +
>  /** A table to translate Rx completion flags to packet type. */
>  uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
>  	/*
> @@ -368,6 +387,335 @@ struct pv {
>  }
> 
>  /**
> + * Obtain and calculate TSO information needed for assembling a TSO WQE.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to a structure to fill the info with.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline int
> +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo)
> +{
> +	struct mlx4_sq *sq = &txq->msq;
> +	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> +				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> +
> +	tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
> +	if (tunneled)
> +		tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
> +	if (unlikely(buf->tso_segsz == 0 ||
> +		     tinfo->tso_header_sz == 0 ||
> +		     tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER ||
> +		     tinfo->tso_header_sz > buf->data_len))
> +		return -EINVAL;
> +	/*
> +	 * Calculate the WQE TSO segment size
> +	 * Note:
> +	 * 1. An LSO segment must be padded such that the subsequent data
> +	 *    segment is 16-byte aligned.
> +	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
> +	 */
> +	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
> +					    tinfo->tso_header_sz,
> +					    sizeof(struct mlx4_wqe_data_seg));
> +	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> +			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> +			     buf->nb_segs;
> +	tinfo->wqe_size =
> +		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
> +			  MLX4_TXBB_SIZE);
> +	/* Validate WQE size and WQE space in the send queue. */
> +	if (sq->remain_size < tinfo->wqe_size ||
> +	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> +		return -ENOMEM;
> +	/* Init pv. */
> +	tinfo->pv = (struct pv *)txq->bounce_buf;
> +	tinfo->pv_counter = 0;
> +	return 0;
> +}
> +
> +/**
> + * Fill the TSO WQE data segments with info on buffers to transmit.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param dseg
> + *   Pointer to the first data segment in the TSO WQE.
> + *
> + * @return
> + *   Pointer to the next WQE control segment on success, NULL otherwise.
> + */
> +static inline volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo,
> +			     volatile struct mlx4_wqe_data_seg *dseg,
> +			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> +	uint32_t lkey;
> +	int nb_segs = buf->nb_segs;
> +	int nb_segs_txbb;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct rte_mbuf *sbuf = buf;
> +	struct pv *pv = tinfo->pv;
> +	int *pv_counter = &tinfo->pv_counter;
> +	uint16_t sb_of = tinfo->tso_header_sz;
> +	uint16_t data_len;
> +
> +	while (nb_segs > 0) {

I think a do-while statement is better here (no need for the check on the first iteration).
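
For illustration, the restructuring could look like this (my sketch only, not
the author's code), relying on nb_segs >= 1 holding for any valid mbuf:

	do {
		/* how many dseg entries do we have in the current TXBB ? */
		nb_segs_txbb =
			(MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
			((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
			sizeof(struct mlx4_wqe_data_seg);
		switch (nb_segs_txbb) {
		/* cases 4..1 exactly as in the patch */
		}
		/* Wrap dseg if it points at the end of the queue. */
		if ((volatile uint8_t *)dseg >= sq->eob)
			dseg = (volatile struct mlx4_wqe_data_seg *)
					((volatile uint8_t *)dseg - sq->size);
	} while (nb_segs > 0);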

> +		/* how many dseg entries do we have in the current TXBB ? */
> +		nb_segs_txbb =
> +			(MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
> +			((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
> +			sizeof(struct mlx4_wqe_data_seg);

Division may be expensive; you can avoid it like this:
nb_segs_txbb = (MLX4_TXBB_SIZE - ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >> MLX4_SEG_SHIFT;
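This works because each mlx4_wqe_data_seg is 16 bytes, so shifting the bytes remaining in the TXBB right by MLX4_SEG_SHIFT (presumably 4, i.e. log2 of the segment size) directly yields the number of free dseg slots.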

> +		switch (nb_segs_txbb) {
> +		case 4:
> +			/* Memory region key for this memory pool. */
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			dseg->addr =
> +			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> +								     uintptr_t,
> +								     sb_of));
> +			dseg->lkey = lkey;
> +			/*
> +			 * This data segment starts at the beginning of a new
> +			 * TXBB, so we need to postpone its byte_count writing
> +			 * for later.
> +			 */
> +			pv[*pv_counter].dseg = dseg;
> +			/*
> +			 * Zero length segment is treated as inline segment
> +			 * with zero data.
> +			 */
> +			data_len = sbuf->data_len - sb_of;
> +			pv[(*pv_counter)++].val =
> +				rte_cpu_to_be_32(data_len ?
> +						 data_len :
> +						 0x80000000);
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;

I think that here and in all the other cases it is better to do "return X" instead of break, where X is the same return value as now, which can be calculated at the start.
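
Something like this, I suppose (my sketch, reusing the patch's names): compute
the return value once at function entry,

	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
		(volatile struct mlx4_wqe_ctrl_seg *)
		((volatile uint8_t *)ctrl + tinfo->wqe_size);

and then each case can end with

	if (--nb_segs == 0)
		return ctrl_next;

which also drops the final "Align next WQE address" computation at the bottom.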

> +			/* fallthrough */
> +		case 3:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 2:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				break;
> +			/* fallthrough */
> +		case 1:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto lkey_err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			--nb_segs;
> +			break;
> +		default:
> +			/* Should never happen */
> +			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
> +				  (void *)txq, nb_segs_txbb);

I think we don't need the default case here. Do you have any scenario where it can really happen?

> +		}
> +		/* Wrap dseg if it points at the end of the queue. */
> +		if ((volatile uint8_t *)dseg >= sq->eob)
> +			dseg = (volatile struct mlx4_wqe_data_seg *)
> +					((volatile uint8_t *)dseg - sq->size);
> +	}
> +	/* Align next WQE address to the next TXBB. */
> +	return (volatile struct mlx4_wqe_ctrl_seg *)
> +		((volatile uint8_t *)ctrl + tinfo->wqe_size);
> +lkey_err:
> +	return NULL;
> +}
> +
> +/**
> + * Fill the packet's l2, l3 and l4 headers to the WQE.
> + *
> + * This will be used as the header for each TSO segment that is transmitted.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param ctrl
> + *   Pointer to the control segment in the TSO WQE.
> + *
> + * @return
> + *   Pointer to the first data segment of the TSO WQE on success, NULL otherwise.
> + */
> +static inline volatile struct mlx4_wqe_data_seg *
> +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> +			   struct txq *txq,
> +			   struct tso_info *tinfo,
> +			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> +	volatile struct mlx4_wqe_lso_seg *tseg =
> +		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct pv *pv = tinfo->pv;
> +	int *pv_counter = &tinfo->pv_counter;
> +	int remain_sz = tinfo->tso_header_sz;
> +	char *from = rte_pktmbuf_mtod(buf, char *);
> +	uint16_t txbb_avail_space;
> +	int copy_sz;
> +	/* Union to overcome volatile constraints when copying TSO header. */
> +	union {
> +		volatile uint8_t *vto;
> +		uint8_t *to;
> +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> +
> +	/*
> +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> +	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> +	 * we can write the first 44 TSO header bytes without worry for TxQ
> +	 * wrapping or overwriting the first TXBB 32bit word.
> +	 */
> +	txbb_avail_space = MLX4_TXBB_SIZE -
> +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> +			    sizeof(struct mlx4_wqe_lso_seg));
> +	do {
> +		copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
> +		rte_memcpy(thdr.to, from, copy_sz);
> +		remain_sz -= copy_sz;
> +		if (remain_sz <= 0)
> +			break;
> +		from += copy_sz;
> +		thdr.to += copy_sz;
> +		/* New TXBB, Check for TxQ wrap. */
> +		if (thdr.to >= sq->eob)
> +			thdr.vto = sq->buf;
> +		/* New TXBB, stash the first 32bits for later use. */
> +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
> +		rte_memcpy(&pv[*pv_counter].val, from,
> +			   RTE_MIN((size_t)remain_sz, sizeof(uint32_t)));
> +		(*pv_counter)++;
> +		from += sizeof(uint32_t);
> +		thdr.to += sizeof(uint32_t);
> +		remain_sz -= sizeof(uint32_t);
> +		/* Space in current TXBB is TXBB size - 4 */
> +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> +	} while (remain_sz > 0);

I think the loop can be better - you now have 5 checks per TXBB; we can reduce it to 2 like this:

txbb_data_space = 44; (not including the first 4 bytes of the current txbb)

while (remain_size >= txbb_data_space + 4) // loop to write the tail of the current txbb + the head of the next txbb
	write txbb_data_space bytes to the WQE.
	Check wrap around.
	Write 4 bytes for the next txbb to pv.
	remain_size -= txbb_data_space + 4
	txbb_data_space = 60

if (remain_size > txbb_data_space) // write tail and partial head
	write txbb_data_space bytes to the WQE.
	Check wrap around.
	Write (remain_size - txbb_data_space) bytes to pv.
else // write only tail
	write remain_size bytes from the header.

Am I missing something?
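
If I read the proposal right, a C rendering could be as below (my sketch only,
untested, reusing the locals of mlx4_tx_burst_fill_tso_hdr):

	int txbb_data_space = MLX4_TXBB_SIZE -
			      sizeof(struct mlx4_wqe_ctrl_seg) -
			      sizeof(struct mlx4_wqe_lso_seg); /* 44 bytes */

	while (remain_sz >= txbb_data_space + (int)sizeof(uint32_t)) {
		/* Tail of the current TXBB. */
		rte_memcpy(thdr.to, from, txbb_data_space);
		from += txbb_data_space;
		thdr.to += txbb_data_space;
		/* New TXBB, check for TxQ wrap. */
		if (thdr.to >= sq->eob)
			thdr.vto = sq->buf;
		/* Stash the first DWORD of the next TXBB for later. */
		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
		rte_memcpy(&pv[*pv_counter].val, from, sizeof(uint32_t));
		(*pv_counter)++;
		from += sizeof(uint32_t);
		thdr.to += sizeof(uint32_t);
		remain_sz -= txbb_data_space + (int)sizeof(uint32_t);
		txbb_data_space = MLX4_TXBB_SIZE - sizeof(uint32_t); /* 60 */
	}
	if (remain_sz > txbb_data_space) {
		/* Tail of this TXBB plus a partial DWORD of the next one. */
		rte_memcpy(thdr.to, from, txbb_data_space);
		from += txbb_data_space;
		thdr.to += txbb_data_space;
		if (thdr.to >= sq->eob)
			thdr.vto = sq->buf;
		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
		pv[*pv_counter].val = 0; /* zero the unused tail bytes */
		rte_memcpy(&pv[*pv_counter].val, from,
			   remain_sz - txbb_data_space);
		(*pv_counter)++;
	} else {
		/* The remainder fits in the current TXBB. */
		rte_memcpy(thdr.to, from, remain_sz);
	}

This keeps the per-TXBB work to the loop condition plus the wrap check.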
	
> +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> +					      tinfo->tso_header_sz);
> +	/* Calculate data segment location */
> +	return (volatile struct mlx4_wqe_data_seg *)
> +				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> +}
> +
> +/**
> + * Write data segments and header for TSO uni/multi segment packet.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param ctrl
> + *   Pointer to the WQE control segment.
> + *
> + * @return
> + *   Pointer to the next WQE control segment on success, NULL otherwise.
> + */
> +static volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
> +		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> +	volatile struct mlx4_wqe_data_seg *dseg;
> +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct tso_info tinfo;
> +	struct pv *pv;
> +	int pv_counter;
> +	int ret;
> +
> +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> +	if (unlikely(ret))
> +		goto error;
> +	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> +	if (unlikely(dseg == NULL))
> +		goto error;
> +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> +		dseg = (volatile struct mlx4_wqe_data_seg *)
> +					((uintptr_t)dseg - sq->size);
> +	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> +	if (unlikely(ctrl_next == NULL))
> +		goto error;
> +	/* Write the first DWORD of each TXBB saved earlier. */
> +	if (tinfo.pv_counter) {

I think you can add likely() here:

The minimum segments:
1. cntrl
2. header eth
3. header IP
4. header IP\tcp
5. at least 1 data segment.

Maybe even we don't need this check.
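
For what it's worth, the header copy has only 44 bytes of room before the first TXBB boundary (per the comment in mlx4_tx_burst_fill_tso_hdr), while a minimal Ethernet + IPv4 + TCP header is already 54 bytes, so pv_counter should indeed be at least 1 for any well-formed TSO packet.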

> +		pv = tinfo.pv;
> +		pv_counter = tinfo.pv_counter;
> +		/* Need a barrier here before writing the first TXBB word. */
> +		rte_io_wmb();
> +		for (--pv_counter; pv_counter  >= 0; pv_counter--)
> +			*pv[pv_counter].dst = pv[pv_counter].val;
> +	}
> +	ctrl->fence_size = tinfo.fence_size;
> +	sq->remain_size -= tinfo.wqe_size;
> +	return ctrl_next;
> +error:
> +	txq->stats.odropped++;
> +	return NULL;
> +}
> +
> +/**
>   * Write data segments of multi-segment packet.
>   *
>   * @param buf
> @@ -560,6 +908,7 @@ struct pv {
>  			uint16_t flags16[2];
>  		} srcrb;
>  		uint32_t lkey;
> +		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
> 
>  		/* Clean up old buffer. */
>  		if (likely(elt->buf != NULL)) {
> @@ -578,7 +927,16 @@ struct pv {
>  			} while (tmp != NULL);
>  		}
>  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -		if (buf->nb_segs == 1) {
> +		if (tso) {
> +			/* Change opcode to TSO */
> +			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> +			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
> +			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> +			if (!ctrl_next) {
> +				elt->buf = NULL;
> +				break;
> +			}
> +		} else if (buf->nb_segs == 1) {
>  			/* Validate WQE space in the send queue. */
>  			if (sq->remain_size < MLX4_TXBB_SIZE) {
>  				elt->buf = NULL;
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index 4c025e3..ffa8abf 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
>  	unsigned int idx; /**< Mapping index. */
>  	uint64_t opackets; /**< Total of successfully sent packets. */
>  	uint64_t obytes; /**< Total of successfully sent bytes. */
> -	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
> +	uint64_t odropped; /**< Total number of packets failed to transmit. */
>  };
> 
>  /** Tx queue descriptor. */
> diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> index 6edaadb..9aa7440 100644
> --- a/drivers/net/mlx4/mlx4_txq.c
> +++ b/drivers/net/mlx4/mlx4_txq.c
> @@ -116,8 +116,14 @@
>  			     DEV_TX_OFFLOAD_UDP_CKSUM |
>  			     DEV_TX_OFFLOAD_TCP_CKSUM);
>  	}
> -	if (priv->hw_csum_l2tun)
> +	if (priv->tso)
> +		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> +	if (priv->hw_csum_l2tun) {
>  		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +		if (priv->tso)
> +			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> +				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> +	}
>  	return offloads;
>  }
> 
> --
> 1.8.3.1
  

Patch

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index f6efd21..98a3f61 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -13,6 +13,7 @@  Queue start/stop     = Y
 MTU update           = Y
 Jumbo frame          = Y
 Scattered Rx         = Y
+TSO                  = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Unicast MAC filter   = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 491106a..12adaeb 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -142,6 +142,9 @@  Limitations
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
 
+- TSO (Transmit Segmentation Offload) is supported in OFED version
+  4.4 and above or in rdma-core version v18 and above.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 73f9d40..63bc003 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -85,6 +85,11 @@  mlx4_autoconf.h.new: FORCE
 mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 	$Q $(RM) -f -- '$@'
 	$Q : > '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_IBV_MLX4_WQE_LSO_SEG \
+		infiniband/mlx4dv.h \
+		type 'struct mlx4_wqe_lso_seg' \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx4_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index d151a90..5d8c76d 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -677,6 +677,15 @@  struct mlx4_conf {
 					IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DEBUG("FCS stripping toggling is %ssupported",
 		      priv->hw_fcs_strip ? "" : "not ");
+		priv->tso =
+			((device_attr_ex.tso_caps.max_tso > 0) &&
+			 (device_attr_ex.tso_caps.supported_qpts &
+			  (1 << IBV_QPT_RAW_PACKET)));
+		if (priv->tso)
+			priv->tso_max_payload_sz =
+					device_attr_ex.tso_caps.max_tso;
+		DEBUG("TSO is %ssupported",
+		      priv->tso ? "" : "not ");
 		/* Configure the first MAC address by default. */
 		err = mlx4_get_mac(priv, &mac.addr_bytes);
 		if (err) {
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 300cb4d..89d8c38 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -47,6 +47,9 @@ 
 /** Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
+/* Maximum packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192
+
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
@@ -90,6 +93,8 @@  struct priv {
 	uint32_t hw_csum:1; /**< Checksum offload is supported. */
 	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
 	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
+	uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
 	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
 	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
 	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index b771d8c..aef77ba 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -19,6 +19,7 @@ 
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
+#include "mlx4_autoconf.h"
 
 /* ConnectX-3 Tx queue basic block. */
 #define MLX4_TXBB_SHIFT 6
@@ -40,6 +41,7 @@ 
 /* Work queue element (WQE) flags. */
 #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
 #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
 
 /* CQE checksum flags. */
 enum {
@@ -98,6 +100,19 @@  struct mlx4_cq {
 	int arm_sn; /**< Rx event counter. */
 };
 
+#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
+/*
+ * WQE LSO segment structure.
+ * Defined here as backward compatibility for rdma-core v17 and below.
+ * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg {
+	rte_be32_t mss_hdr_size;
+	rte_be32_t header[];
+};
+#endif
+
 /**
  * Retrieve a CQE entry from a CQ.
  *
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 78b6dd5..750ad6d 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -38,10 +38,29 @@ 
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	volatile struct mlx4_wqe_data_seg *dseg;
+	union {
+		volatile struct mlx4_wqe_data_seg *dseg;
+		volatile uint32_t *dst;
+	};
 	uint32_t val;
 };
 
+/** A helper structure for TSO packet handling. */
+struct tso_info {
+	/** Pointer to the array of saved first DWORD (32 bit) of a TXBB. */
+	struct pv *pv;
+	/** Current entry in the pv array. */
+	int pv_counter;
+	/** Total size of the WQE including padding. */
+	uint32_t wqe_size;
+	/** size of TSO header to prepend to each packet to send. */
+	uint16_t tso_header_sz;
+	/** Total size of the TSO segment in the WQE. */
+	uint16_t wqe_tso_seg_size;
+	/** Raw WQE size in units of 16 Bytes and without padding. */
+	uint8_t fence_size;
+};
+
 /** A table to translate Rx completion flags to packet type. */
 uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
 	/*
@@ -368,6 +387,335 @@  struct pv {
 }
 
 /**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+	tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
+	if (tunneled)
+		tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
+	if (unlikely(buf->tso_segsz == 0 ||
+		     tinfo->tso_header_sz == 0 ||
+		     tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER ||
+		     tinfo->tso_header_sz > buf->data_len))
+		return -EINVAL;
+	/*
+	 * Calculate the WQE TSO segment size
+	 * Note:
+	 * 1. An LSO segment must be padded such that the subsequent data
+	 *    segment is 16-byte aligned.
+	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
+	 */
+	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
+					    tinfo->tso_header_sz,
+					    sizeof(struct mlx4_wqe_data_seg));
+	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+			     buf->nb_segs;
+	tinfo->wqe_size =
+		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+			  MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < tinfo->wqe_size ||
+	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+		return -ENOMEM;
+	/* Init pv. */
+	tinfo->pv = (struct pv *)txq->bounce_buf;
+	tinfo->pv_counter = 0;
+	return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo,
+			     volatile struct mlx4_wqe_data_seg *dseg,
+			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	uint32_t lkey;
+	int nb_segs = buf->nb_segs;
+	int nb_segs_txbb;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	uint16_t sb_of = tinfo->tso_header_sz;
+	uint16_t data_len;
+
+	while (nb_segs > 0) {
+		/* how many dseg entries do we have in the current TXBB ? */
+		nb_segs_txbb =
+			(MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
+			((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
+			sizeof(struct mlx4_wqe_data_seg);
+		switch (nb_segs_txbb) {
+		case 4:
+			/* Memory region key for this memory pool. */
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			dseg->addr =
+			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
+								     uintptr_t,
+								     sb_of));
+			dseg->lkey = lkey;
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[*pv_counter].dseg = dseg;
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			data_len = sbuf->data_len - sb_of;
+			pv[(*pv_counter)++].val =
+				rte_cpu_to_be_32(data_len ?
+						 data_len :
+						 0x80000000);
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 3:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 2:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				break;
+			/* fallthrough */
+		case 1:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto lkey_err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			--nb_segs;
+			break;
+		default:
+			/* Should never happen */
+			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
+				  (void *)txq, nb_segs_txbb);
+		}
+		/* Wrap dseg if it points at the end of the queue. */
+		if ((volatile uint8_t *)dseg >= sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((volatile uint8_t *)dseg - sq->size);
+	}
+	/* Align next WQE address to the next TXBB. */
+	return (volatile struct mlx4_wqe_ctrl_seg *)
+		((volatile uint8_t *)ctrl + tinfo->wqe_size);
+lkey_err:
+	return NULL;
+}
+
+/**
+ * Fill the packet's l2, l3 and l4 headers to the WQE.
+ *
+ * This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the first data segment of the TSO WQE on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_data_seg *
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+			   struct txq *txq,
+			   struct tso_info *tinfo,
+			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_lso_seg *tseg =
+		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	int remain_sz = tinfo->tso_header_sz;
+	char *from = rte_pktmbuf_mtod(buf, char *);
+	uint16_t txbb_avail_space;
+	int copy_sz;
+	/* Union to overcome volatile constraints when copying TSO header. */
+	union {
+		volatile uint8_t *vto;
+		uint8_t *to;
+	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+	/*
+	 * TSO data always starts at offset 20 from the beginning of the TXBB
+	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
+	 * we can write the first 44 TSO header bytes without worry for TxQ
+	 * wrapping or overwriting the first TXBB 32bit word.
+	 */
+	txbb_avail_space = MLX4_TXBB_SIZE -
+			   (sizeof(struct mlx4_wqe_ctrl_seg) +
+			    sizeof(struct mlx4_wqe_lso_seg));
+	do {
+		copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
+		rte_memcpy(thdr.to, from, copy_sz);
+		remain_sz -= copy_sz;
+		if (remain_sz <= 0)
+			break;
+		from += copy_sz;
+		thdr.to += copy_sz;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		/* New TXBB, stash the first 32bits for later use. */
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
+		rte_memcpy(&pv[*pv_counter].val, from,
+			   RTE_MIN((size_t)remain_sz, sizeof(uint32_t)));
+		(*pv_counter)++;
+		from += sizeof(uint32_t);
+		thdr.to += sizeof(uint32_t);
+		remain_sz -= sizeof(uint32_t);
+		/* Space in current TXBB is TXBB size - 4 */
+		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+	} while (remain_sz > 0);
+	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+					      tinfo->tso_header_sz);
+	/* Calculate data segment location */
+	return (volatile struct mlx4_wqe_data_seg *)
+				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+	struct mlx4_sq *sq = &txq->msq;
+	struct tso_info tinfo;
+	struct pv *pv;
+	int pv_counter;
+	int ret;
+
+	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+	if (unlikely(ret))
+		goto error;
+	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
+	if (unlikely(dseg == NULL))
+		goto error;
+	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)dseg - sq->size);
+	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
+	if (unlikely(ctrl_next == NULL))
+		goto error;
+	/* Write the first DWORD of each TXBB saved earlier. */
+	if (tinfo.pv_counter) {
+		pv = tinfo.pv;
+		pv_counter = tinfo.pv_counter;
+		/* Need a barrier here before writing the first TXBB word. */
+		rte_io_wmb();
+		for (--pv_counter; pv_counter  >= 0; pv_counter--)
+			*pv[pv_counter].dst = pv[pv_counter].val;
+	}
+	ctrl->fence_size = tinfo.fence_size;
+	sq->remain_size -= tinfo.wqe_size;
+	return ctrl_next;
+error:
+	txq->stats.odropped++;
+	return NULL;
+}
+
+/**
  * Write data segments of multi-segment packet.
  *
  * @param buf
@@ -560,6 +908,7 @@  struct pv {
 			uint16_t flags16[2];
 		} srcrb;
 		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -578,7 +927,16 @@  struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		if (buf->nb_segs == 1) {
+		if (tso) {
+			/* Change opcode to TSO */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
 			/* Validate WQE space in the send queue. */
 			if (sq->remain_size < MLX4_TXBB_SIZE) {
 				elt->buf = NULL;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 4c025e3..ffa8abf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -90,7 +90,7 @@  struct mlx4_txq_stats {
 	unsigned int idx; /**< Mapping index. */
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
-	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+	uint64_t odropped; /**< Total number of packets failed to transmit. */
 };
 
 /** Tx queue descriptor. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 6edaadb..9aa7440 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -116,8 +116,14 @@ 
 			     DEV_TX_OFFLOAD_UDP_CKSUM |
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	}
-	if (priv->hw_csum_l2tun)
+	if (priv->tso)
+		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun) {
 		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+		if (priv->tso)
+			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
+	}
 	return offloads;
 }