[dpdk-dev,v2,7/7] net/mlx4: separate Tx for multi-segments
Commit Message
This commit optimizes the handling of single-segment packets and calls a
dedicated function to handle multi-segment packets.
Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
drivers/net/mlx4/mlx4_rxtx.c | 284 +++++++++++++++++++++++--------------------
1 file changed, 154 insertions(+), 130 deletions(-)
Comments
Hi Ophir,
On Mon, Oct 23, 2017 at 02:22:00PM +0000, Ophir Munk wrote:
> This commit optimizes the handling of single-segment packets and calls a
> dedicated function to handle multi-segment packets.
>
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
While it indeed moves the code to a separate function, I'm not sure by how
much it improves performance.
Is it noticeably better? Can you provide a short performance summary with
and without this patch? Is that the case for both single and multi-segment
scenarios, or was this improvement at the cost of a degradation in the
latter case?
If it splits a large function into two smaller ones for readability and no
performance validation was done on this specific patch alone, please do not
label it as a performance improvement. I'm fine with readability
improvements when properly identified as such.
A few additional comments below.
> ---
> drivers/net/mlx4/mlx4_rxtx.c | 284 +++++++++++++++++++++++--------------------
> 1 file changed, 154 insertions(+), 130 deletions(-)
>
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 3236552..9596859 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -62,6 +62,9 @@
> #include "mlx4_rxtx.h"
> #include "mlx4_utils.h"
>
> +#define WQE_ONE_DATA_SEG_SIZE \
> + (sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
> +
> /**
> * Pointer-value pair structure used in tx_post_send for saving the first
> * DWORD (32 bits) of a TXBB.
> @@ -140,22 +143,19 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
> * @return
> * 0 on success, -1 on failure.
> */
> -static int
> -mlx4_txq_complete(struct txq *txq)
> +static inline int __attribute__((always_inline))
Should be static only; leave the rest to the compiler. This function is
large enough that it shouldn't make much of a difference anyway (unless
proved otherwise).
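
For reference, the distinction being requested, in a minimal sketch with
illustrative names (assuming a GCC/clang toolchain): `static` alone leaves
inlining to the compiler's heuristics, while the attribute forces inlining
at every call site regardless of function size.

    /* Inlined (or not) at the compiler's discretion, e.g. under -O2. */
    static int
    complete_default(int x)
    {
            return x * 2;
    }

    /* Forced inline at every call site (GCC/clang extension). */
    static inline int __attribute__((always_inline))
    complete_forced(int x)
    {
            return x * 2;
    }

    int
    use_both(int x)
    {
            return complete_default(x) + complete_forced(x);
    }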
> +mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
> + struct mlx4_sq *sq)
> {
> unsigned int elts_comp = txq->elts_comp;
> unsigned int elts_tail = txq->elts_tail;
> - const unsigned int elts_n = txq->elts_n;
> struct mlx4_cq *cq = &txq->mcq;
> - struct mlx4_sq *sq = &txq->msq;
> struct mlx4_cqe *cqe;
> uint32_t cons_index = cq->cons_index;
> uint16_t new_index;
> uint16_t nr_txbbs = 0;
> int pkts = 0;
>
> - if (unlikely(elts_comp == 0))
> - return 0;
> /*
> * Traverse over all CQ entries reported and handle each WQ entry
> * reported by them.
> @@ -266,6 +266,120 @@ rte_be32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
> return txq->mp2mr[i].lkey;
> }
>
> +static int handle_multi_segs(struct rte_mbuf *buf,
> + struct txq *txq,
> + struct mlx4_wqe_ctrl_seg **pctrl)
> +{
> + int wqe_real_size;
> + int nr_txbbs;
> + struct pv *pv = (struct pv *)txq->bounce_buf;
> + struct mlx4_sq *sq = &txq->msq;
> + uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> + struct mlx4_wqe_ctrl_seg *ctrl;
> + struct mlx4_wqe_data_seg *dseg;
> + uintptr_t addr;
> + uint32_t byte_count;
> + int pv_counter = 0;
> +
> + /* Calculate the needed work queue entry size for this packet. */
> + wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> + buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> + nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> + /*
> + * Check that there is room for this WQE in the send queue and that
> + * the WQE size is legal.
> + */
> + if (((sq->head - sq->tail) + nr_txbbs +
> + sq->headroom_txbbs) >= sq->txbb_cnt ||
> + nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> + return -1;
> + }
> +
> + /* Get the control and data entries of the WQE. */
> + ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
> + dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> + sizeof(struct mlx4_wqe_ctrl_seg));
> + *pctrl = ctrl;
> + /* Fill the data segments with buffer information. */
> + struct rte_mbuf *sbuf;
> +
> + for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
> + addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> + rte_prefetch0((volatile void *)addr);
> + /* Handle WQE wraparound. */
> + if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
> + dseg = (struct mlx4_wqe_data_seg *)sq->buf;
> + dseg->addr = rte_cpu_to_be_64(addr);
> + /* Memory region key (big endian) for this memory pool. */
> + dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> +#ifndef NDEBUG
> + /* Calculate the needed work queue entry size for this packet */
> + if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
> + /* MR does not exist. */
> + DEBUG("%p: unable to get MP <-> MR association",
> + (void *)txq);
> + /*
> + * Restamp entry in case of failure.
> + * Make sure that size is written correctly
> + * Note that we give ownership to the SW, not the HW.
> + */
> + wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> + buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> + ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> + mlx4_txq_stamp_freed_wqe(sq, head_idx,
> + (sq->head & sq->txbb_cnt) ? 0 : 1);
> + return -1;
> + }
> +#endif /* NDEBUG */
> + if (likely(sbuf->data_len)) {
> + byte_count = rte_cpu_to_be_32(sbuf->data_len);
> + } else {
> + /*
> + * Zero length segment is treated as inline segment
> + * with zero data.
> + */
> + byte_count = RTE_BE32(0x80000000);
> + }
> + /*
> + * If the data segment is not at the beginning of a
> + * Tx basic block (TXBB) then write the byte count,
> + * else postpone the writing to just before updating the
> + * control segment.
> + */
> + if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> + /*
> + * Need a barrier here before writing the byte_count
> + * fields to make sure that all the data is visible
> + * before the byte_count field is set.
> + * Otherwise, if the segment begins a new cacheline,
> + * the HCA prefetcher could grab the 64-byte chunk and
> + * get a valid (!= 0xffffffff) byte count but stale
> + * data, and end up sending the wrong data.
> + */
> + rte_io_wmb();
> + dseg->byte_count = byte_count;
> + } else {
> + /*
> + * This data segment starts at the beginning of a new
> + * TXBB, so we need to postpone its byte_count writing
> + * for later.
> + */
> + pv[pv_counter].dseg = dseg;
> + pv[pv_counter++].val = byte_count;
> + }
> + }
> + /* Write the first DWORD of each TXBB saved earlier. */
> + if (pv_counter) {
> + /* Need a barrier here before writing the byte_count. */
> + rte_io_wmb();
> + for (--pv_counter; pv_counter >= 0; pv_counter--)
> + pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
> + }
> + /* Fill the control parameters for this packet. */
> + ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +
> + return nr_txbbs;
> +}
> /**
> * DPDK callback for Tx.
> *
> @@ -288,10 +402,11 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> unsigned int i;
> unsigned int max;
> struct mlx4_sq *sq = &txq->msq;
> - struct pv *pv = (struct pv *)txq->bounce_buf;
> + int nr_txbbs;
>
> assert(txq->elts_comp_cd != 0);
> - mlx4_txq_complete(txq);
> + if (likely(txq->elts_comp != 0))
> + mlx4_txq_complete(txq, elts_n, sq);
> max = (elts_n - (elts_head - txq->elts_tail));
> if (max > elts_n)
> max -= elts_n;
> @@ -316,10 +431,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> } srcrb;
> uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
> uintptr_t addr;
> - uint32_t byte_count;
> - int wqe_real_size;
> - int nr_txbbs;
> - int pv_counter = 0;
>
> /* Clean up old buffer. */
> if (likely(elt->buf != NULL)) {
> @@ -338,31 +449,22 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> } while (tmp != NULL);
> }
> RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -
> - /*
> - * Calculate the needed work queue entry size
> - * for this packet.
> - */
> - wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
> - buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
> - nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
> - /*
> - * Check that there is room for this WQE in the send
> - * queue and that the WQE size is legal.
> - */
> - if (((sq->head - sq->tail) + nr_txbbs +
> - sq->headroom_txbbs) >= sq->txbb_cnt ||
> - nr_txbbs > MLX4_MAX_WQE_TXBBS) {
> - elt->buf = NULL;
> - break;
> - }
> - /* Get the control and data entries of the WQE. */
> - ctrl = (struct mlx4_wqe_ctrl_seg *)
> - mlx4_get_send_wqe(sq, head_idx);
> - dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> - sizeof(struct mlx4_wqe_ctrl_seg));
> - /* Fill the data segments with buffer information. */
> if (likely(buf->nb_segs == 1)) {
> + /*
> + * Check that there is room for this WQE in the send
> + * queue and that the WQE size is legal
> + */
> + if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs)
> + >= sq->txbb_cnt ||
> + 1 > MLX4_MAX_WQE_TXBBS) {
> + elt->buf = NULL;
> + break;
> + }
> + /* Get the control and data entries of the WQE. */
> + ctrl = (struct mlx4_wqe_ctrl_seg *)
> + mlx4_get_send_wqe(sq, head_idx);
> + dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
> + sizeof(struct mlx4_wqe_ctrl_seg));
> addr = rte_pktmbuf_mtod(buf, uintptr_t);
> rte_prefetch0((volatile void *)addr);
> /* Handle WQE wraparound. */
> @@ -371,120 +473,42 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> dseg = (struct mlx4_wqe_data_seg *)sq->buf;
> dseg->addr = rte_cpu_to_be_64(addr);
> /* Memory region key (big endian). */
> - dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
> - #ifndef NDEBUG
> + dseg->lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
> +#ifndef NDEBUG
> if (unlikely(dseg->lkey ==
> rte_cpu_to_be_32((uint32_t)-1))) {
> /* MR does not exist. */
> DEBUG("%p: unable to get MP <-> MR association",
> - (void *)txq);
> + (void *)txq);
> /*
> * Restamp entry in case of failure.
> * Make sure that size is written correctly
> * Note that we give ownership to the SW,
> * not the HW.
> */
> - ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> + ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4)
> + & 0x3f;
> mlx4_txq_stamp_freed_wqe(sq, head_idx,
> - (sq->head & sq->txbb_cnt) ? 0 : 1);
> + (sq->head & sq->txbb_cnt) ? 0 : 1);
> elt->buf = NULL;
> break;
> }
> - #endif /* NDEBUG */
> +#endif /* NDEBUG */
> /* Need a barrier here before writing the byte_count. */
> rte_io_wmb();
> dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
> +
> + /* Fill the control parameters for this packet. */
> + ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
> + nr_txbbs = 1;
> } else {
> - /* Fill the data segments with buffer information. */
> - struct rte_mbuf *sbuf;
> -
> - for (sbuf = buf;
> - sbuf != NULL;
> - sbuf = sbuf->next, dseg++) {
> - addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
> - rte_prefetch0((volatile void *)addr);
> - /* Handle WQE wraparound. */
> - if (unlikely(dseg >=
> - (struct mlx4_wqe_data_seg *)sq->eob))
> - dseg = (struct mlx4_wqe_data_seg *)
> - sq->buf;
> - dseg->addr = rte_cpu_to_be_64(addr);
> - /* Memory region key (big endian). */
> - dseg->lkey = mlx4_txq_mp2mr(txq,
> - mlx4_txq_mb2mp(sbuf));
> - #ifndef NDEBUG
> - if (unlikely(dseg->lkey ==
> - rte_cpu_to_be_32((uint32_t)-1))) {
> - /* MR does not exist. */
> - DEBUG("%p: unable to get MP <-> MR association",
> - (void *)txq);
> - /*
> - * Restamp entry in case of failure.
> - * Make sure that size is written
> - * correctly, note that we give
> - * ownership to the SW, not the HW.
> - */
> - ctrl->fence_size =
> - (wqe_real_size >> 4) & 0x3f;
> - mlx4_txq_stamp_freed_wqe(sq, head_idx,
> - (sq->head & sq->txbb_cnt) ? 0 : 1);
> - elt->buf = NULL;
> - break;
> - }
> - #endif /* NDEBUG */
> - if (likely(sbuf->data_len)) {
> - byte_count =
> - rte_cpu_to_be_32(sbuf->data_len);
> - } else {
> - /*
> - * Zero length segment is treated as
> - * inline segment with zero data.
> - */
> - byte_count = RTE_BE32(0x80000000);
> - }
> - /*
> - * If the data segment is not at the beginning
> - * of a Tx basic block (TXBB) then write the
> - * byte count, else postpone the writing to
> - * just before updating the control segment.
> - */
> - if ((uintptr_t)dseg &
> - (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
> - /*
> - * Need a barrier here before writing
> - * the byte_count fields to make sure
> - * that all the data is visible before
> - * the byte_count field is set.
> - * Otherwise, if the segment begins a
> - * new cacheline, the HCA prefetcher
> - * could grab the 64-byte chunk and get
> - * a valid (!= 0xffffffff) byte count
> - * but stale data, and end up sending
> - * the wrong data.
> - */
> - rte_io_wmb();
> - dseg->byte_count = byte_count;
> - } else {
> - /*
> - * This data segment starts at the
> - * beginning of a new TXBB, so we
> - * need to postpone its byte_count
> - * writing for later.
> - */
> - pv[pv_counter].dseg = dseg;
> - pv[pv_counter++].val = byte_count;
> - }
> + nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
Having all this part non-inline could degrade multi-segment performance; is
that OK?
> + if (nr_txbbs < 0) {
> + elt->buf = NULL;
> + break;
> }
> - /* Write the first DWORD of each TXBB save earlier. */
> - if (pv_counter) {
> - /* Need a barrier before writing the byte_count. */
> - rte_io_wmb();
> - for (--pv_counter; pv_counter >= 0; pv_counter--)
> - pv[pv_counter].dseg->byte_count =
> - pv[pv_counter].val;
> }
> - /* Fill the control parameters for this packet. */
> - ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
> +
> /*
> * For raw Ethernet, the SOLICIT flag is used to indicate
> * that no ICRC should be calculated.
> --
> 2.7.4
>
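As background for the sizing checks above, a worked sketch of the TXBB
arithmetic, assuming the usual mlx4 layout of a 16-byte control segment,
16-byte data segments and 64-byte Tx basic blocks (TXBBs); the constants
below are illustrative stand-ins for the driver's structures and
MLX4_SIZE_TO_TXBBS(). A single-segment WQE is 16 + 16 = 32 bytes and always
fits in one TXBB, which is why the single-segment path can use
WQE_ONE_DATA_SEG_SIZE and hardcode nr_txbbs = 1, while e.g. a four-segment
packet needs 16 + 4 * 16 = 80 bytes, i.e. two TXBBs.

    #include <stdio.h>

    #define CTRL_SEG_SIZE 16
    #define DATA_SEG_SIZE 16
    #define TXBB_SIZE     64
    /* Round a WQE size up to whole Tx basic blocks. */
    #define SIZE_TO_TXBBS(size) (((size) + TXBB_SIZE - 1) / TXBB_SIZE)

    int
    main(void)
    {
            unsigned int nb_segs;

            for (nb_segs = 1; nb_segs <= 4; nb_segs++) {
                    unsigned int wqe_real_size = CTRL_SEG_SIZE +
                            nb_segs * DATA_SEG_SIZE;

                    printf("%u seg(s): WQE %u bytes -> %u TXBB(s)\n",
                           nb_segs, wqe_real_size,
                           SIZE_TO_TXBBS(wqe_real_size));
            }
            return 0;
    }
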
Hi,
Please see inline.
On Wednesday, October 25, 2017 7:50 PM Adrien Mazarguil wrote:
>
> Hi Ophir,
>
> On Mon, Oct 23, 2017 at 02:22:00PM +0000, Ophir Munk wrote:
> > This commit optimizes the handling of single-segment packets and calls a
> > dedicated function to handle multi-segment packets.
> >
> > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
>
> While it indeed moves the code to a separate function, I'm not sure by how
> much it improves performance.
>
> Is it noticeably better? Can you provide a short performance summary with
> and without this patch? Is that the case for both single and multi-segment
> scenarios, or was this improvement at the cost of a degradation in the latter
> case?
>
In v3 this commit is squashed into the previous commit "net/mlx4: improve
performance of one Tx segment", as both commits represent one logical unit.
On Matan's setup the two commits improve performance in both single and
multi-segment scenarios.
On my setup the improvement occurs for single-segment packets only.
With the patch versus without it (packet layout as segment sizes in bytes):
64          +0.2  mpps
64,64       -0.2  mpps
64,64,64,64 -0.07 mpps
> If it splits a large function into two smaller ones for readability and no
> performance validation was done on this specific patch alone, please do not
> label it as a performance improvement. I'm fine with readability
> improvements when properly identified as such.
>
The performance improvement indication was removed from the commit message.
> A few additional comments below.
>
> [...]
> > -static int
> > -mlx4_txq_complete(struct txq *txq)
> > +static inline int __attribute__((always_inline))
>
> Should be static only; leave the rest to the compiler. This function is large
> enough that it shouldn't make much of a difference anyway (unless proved
> otherwise).
>
Done.
__attribute__((always_inline)) was removed.
> [...]
> > + nr_txbbs = handle_multi_segs(buf, txq, &ctrl);
>
> Having all this part non-inline could degrade multi-segment performance; is
> that OK?
It is still inlined: the function is static with a single call site, which the compiler inlines on its own. Performance is not degraded in this case.
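
Whether a plain static function is actually inlined is the compiler's
decision; with a single call site it normally is at -O2, and this can be
verified after the build, since a fully inlined static function leaves no
symbol behind in the object file (e.g. nm on mlx4_rxtx.o). Either way, the
split follows a common fast-path/slow-path pattern, sketched below with
illustrative names (assuming GCC/clang for __builtin_expect):

    #include <stdint.h>

    struct pkt {
            uint32_t nb_segs;
    };

    /* Cold path: may stay out of line without hurting the
     * instruction cache of the hot loop. */
    static int
    tx_multi_seg(const struct pkt *p)
    {
            /* Walk the segment chain, one data segment per element. */
            return (int)p->nb_segs;
    }

    int
    tx_burst(const struct pkt *pkts, uint32_t n)
    {
            uint32_t i;
            int txbbs = 0;

            for (i = 0; i < n; i++) {
                    /* Single-segment packets are the expected case. */
                    if (__builtin_expect(pkts[i].nb_segs == 1, 1))
                            txbbs += 1;
                    else
                            txbbs += tx_multi_seg(&pkts[i]);
            }
            return txbbs;
    }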