[dpdk-dev,2/2] ixgbe:replace compiler memory barrier and rte_wmb with rte_dma_rmb and rte_dma_wmb.

Message ID BLU436-SMTP81D5E9D2435D08D37396B7BFAB0@phx.gbl (mailing list archive)
State Changes Requested, archived
Headers

Commit Message

WangDong June 28, 2015, 3:23 p.m. UTC
  ---
 drivers/net/ixgbe/ixgbe_rxtx.c     | 30 +++++++++---------------------
 drivers/net/ixgbe/ixgbe_rxtx_vec.c |  3 +++
 2 files changed, 12 insertions(+), 21 deletions(-)
  

Comments

Ananyev, Konstantin July 2, 2015, 4:19 p.m. UTC | #1
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of WangDong
> Sent: Sunday, June 28, 2015 4:23 PM
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 2/2] ixgbe:replace compiler memory barrier and rte_wmb with rte_dma_rmb and rte_dma_wmb.
> 
> ---
>  drivers/net/ixgbe/ixgbe_rxtx.c     | 30 +++++++++---------------------
>  drivers/net/ixgbe/ixgbe_rxtx_vec.c |  3 +++
>  2 files changed, 12 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index 3ace8a8..3316488 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -130,6 +130,7 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
> 
>  	/* check DD bit on threshold descriptor */
>  	status = txq->tx_ring[txq->tx_next_dd].wb.status;
> +	rte_dma_rmb();
>  	if (! (status & IXGBE_ADVTXD_STAT_DD))
>  		return 0;

Could you explain why we need an rmb here for the weak-ordering model?
We don't read the rest of the TXD later, so nothing could be reordered here.

> 
> @@ -320,7 +321,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  		txq->tx_tail = 0;
> 
>  	/* update tail pointer */
> -	rte_wmb();
> +	rte_dma_wmb();
>  	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
> 
>  	return nb_pkts;
> @@ -841,7 +842,6 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  		txd->read.cmd_type_len |= rte_cpu_to_le_32(cmd_type_len);
>  	}
>  end_of_tx:
> -	rte_wmb();
> 
>  	/*
>  	 * Set the Transmit Descriptor Tail (TDT)
> @@ -849,6 +849,7 @@ end_of_tx:
>  	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
>  		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
>  		   (unsigned) tx_id, (unsigned) nb_tx);
> +	rte_dma_wmb();
>  	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
>  	txq->tx_tail = tx_id;
> 
> @@ -975,6 +976,7 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
> 
>  		/* Compute how many status bits were set */
>  		nb_dd = 0;
> +		rte_dma_rmb();

I think that's a bit too late for rmb() here.
We need to preserve order of reading all 8 statuses, so I am afraid we need to:

/* Read desc statuses backwards to avoid race condition */
-for (j = LOOK_AHEAD-1; j >= 0; --j)
+for (j = LOOK_AHEAD-1; j >= 0; --j) {
+	rte_dma_rmb();
 	s[j] = rxdp[j].wb.upper.status_error;
+}

>  		for (j = 0; j < LOOK_AHEAD; ++j)
>  			nb_dd += s[j] & IXGBE_RXDADV_STAT_DD;
> 
> @@ -1138,7 +1140,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		}
> 
>  		/* update tail pointer */
> -		rte_wmb();
> +		rte_dma_wmb();
>  		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, cur_free_trigger);
>  	}
> 
> @@ -1229,13 +1231,10 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		/*
>  		 * The order of operations here is important as the DD status
>  		 * bit must not be read after any other descriptor fields.
> -		 * rx_ring and rxdp are pointing to volatile data so the order
> -		 * of accesses cannot be reordered by the compiler. If they were
> -		 * not volatile, they could be reordered which could lead to
> -		 * using invalid descriptor fields when read from rxd.
>  		 */
>  		rxdp = &rx_ring[rx_id];
>  		staterr = rxdp->wb.upper.status_error;
> +		rte_dma_rmb();
>  		if (! (staterr & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
>  			break;
>  		rxd = *rxdp;
> @@ -1373,6 +1372,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  			   (unsigned) nb_rx);
>  		rx_id = (uint16_t) ((rx_id == 0) ?
>  				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
> +		rte_dma_wmb();
>  		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
>  		nb_hold = 0;
>  	}
> @@ -1494,17 +1494,6 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
> 
>  next_desc:
>  		/*
> -		 * The code in this whole file uses the volatile pointer to
> -		 * ensure the read ordering of the status and the rest of the
> -		 * descriptor fields (on the compiler level only!!!). This is so
> -		 * UGLY - why not to just use the compiler barrier instead? DPDK
> -		 * even has the rte_compiler_barrier() for that.
> -		 *
> -		 * But most importantly this is just wrong because this doesn't
> -		 * ensure memory ordering in a general case at all. For
> -		 * instance, DPDK is supposed to work on Power CPUs where
> -		 * compiler barrier may just not be enough!
> -		 *
>  		 * I tried to write only this function properly to have a
>  		 * starting point (as a part of an LRO/RSC series) but the
>  		 * compiler cursed at me when I tried to cast away the
> @@ -1519,12 +1508,11 @@ next_desc:
>  		 * TODO:
>  		 *    - Get rid of "volatile" crap and let the compiler do its
>  		 *      job.
> -		 *    - Use the proper memory barrier (rte_rmb()) to ensure the
> -		 *      memory ordering below.
>  		 */
>  		rxdp = &rx_ring[rx_id];
>  		staterr = rte_le_to_cpu_32(rxdp->wb.upper.status_error);
> 
> +		rte_dma_rmb();
>  		if (!(staterr & IXGBE_RXDADV_STAT_DD))
>  			break;
> 
> @@ -1704,7 +1692,7 @@ next_desc:
>  			   "nb_hold=%u nb_rx=%u",
>  			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
> 
> -		rte_wmb();
> +		rte_dma_wmb();
>  		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, prev_id);
>  		nb_hold = 0;
>  	}


I think you missed one more wmb() in that function:
ixgbe_recv_pkts_lro(...)
{
 ...
} else if (nb_hold > rxq->rx_free_thresh) {
                        uint16_t next_rdt = rxq->rx_free_trigger;

                        if (!ixgbe_rx_alloc_bufs(rxq, false)) {
                                rte_wmb();
                                IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr,
                                                    next_rdt);
                                nb_hold -= rxq->rx_free_thresh;
                        } else {

> diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> index abd10f6..af4d779 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c

In fact, I think there is not much point in modifying that one.
Vector routines use IA-specific intrinsics, so that code wouldn't work on any other architecture anyway.


> @@ -123,6 +123,7 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
>  			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
> 
>  	/* Update the tail pointer on the NIC */
> +	rte_dma_wmb();
>  	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
>  }
> 
> @@ -528,6 +529,7 @@ ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
> 
>  	/* check DD bit on threshold descriptor */
>  	status = txq->tx_ring[txq->tx_next_dd].wb.status;
> +	rte_dma_rmb();
>  	if (!(status & IXGBE_ADVTXD_STAT_DD))
>  		return 0;


Again, as with its scalar counterpart, I don't think we need an rmb here.
We read only the status from one TXD, that's it.
But as I said above, there is probably no need to touch that file at all.

Konstantin

> 
> @@ -645,6 +647,7 @@ ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> 
>  	txq->tx_tail = tx_id;
> 
> +	rte_dma_wmb();
>  	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
> 
>  	return nb_pkts;
> --
> 2.1.0
  

Patch

diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 3ace8a8..3316488 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -130,6 +130,7 @@  ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
 
 	/* check DD bit on threshold descriptor */
 	status = txq->tx_ring[txq->tx_next_dd].wb.status;
+	rte_dma_rmb();
 	if (! (status & IXGBE_ADVTXD_STAT_DD))
 		return 0;
 
@@ -320,7 +321,7 @@  tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		txq->tx_tail = 0;
 
 	/* update tail pointer */
-	rte_wmb();
+	rte_dma_wmb();
 	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
@@ -841,7 +842,6 @@  ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 		txd->read.cmd_type_len |= rte_cpu_to_le_32(cmd_type_len);
 	}
 end_of_tx:
-	rte_wmb();
 
 	/*
 	 * Set the Transmit Descriptor Tail (TDT)
@@ -849,6 +849,7 @@  end_of_tx:
 	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
 		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
 		   (unsigned) tx_id, (unsigned) nb_tx);
+	rte_dma_wmb();
 	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
@@ -975,6 +976,7 @@  ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
 
 		/* Compute how many status bits were set */
 		nb_dd = 0;
+		rte_dma_rmb();
 		for (j = 0; j < LOOK_AHEAD; ++j)
 			nb_dd += s[j] & IXGBE_RXDADV_STAT_DD;
 
@@ -1138,7 +1140,7 @@  rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		}
 
 		/* update tail pointer */
-		rte_wmb();
+		rte_dma_wmb();
 		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, cur_free_trigger);
 	}
 
@@ -1229,13 +1231,10 @@  ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 		/*
 		 * The order of operations here is important as the DD status
 		 * bit must not be read after any other descriptor fields.
-		 * rx_ring and rxdp are pointing to volatile data so the order
-		 * of accesses cannot be reordered by the compiler. If they were
-		 * not volatile, they could be reordered which could lead to
-		 * using invalid descriptor fields when read from rxd.
 		 */
 		rxdp = &rx_ring[rx_id];
 		staterr = rxdp->wb.upper.status_error;
+		rte_dma_rmb();
 		if (! (staterr & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
 			break;
 		rxd = *rxdp;
@@ -1373,6 +1372,7 @@  ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			   (unsigned) nb_rx);
 		rx_id = (uint16_t) ((rx_id == 0) ?
 				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
+		rte_dma_wmb();
 		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
 		nb_hold = 0;
 	}
@@ -1494,17 +1494,6 @@  ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 
 next_desc:
 		/*
-		 * The code in this whole file uses the volatile pointer to
-		 * ensure the read ordering of the status and the rest of the
-		 * descriptor fields (on the compiler level only!!!). This is so
-		 * UGLY - why not to just use the compiler barrier instead? DPDK
-		 * even has the rte_compiler_barrier() for that.
-		 *
-		 * But most importantly this is just wrong because this doesn't
-		 * ensure memory ordering in a general case at all. For
-		 * instance, DPDK is supposed to work on Power CPUs where
-		 * compiler barrier may just not be enough!
-		 *
 		 * I tried to write only this function properly to have a
 		 * starting point (as a part of an LRO/RSC series) but the
 		 * compiler cursed at me when I tried to cast away the
@@ -1519,12 +1508,11 @@  next_desc:
 		 * TODO:
 		 *    - Get rid of "volatile" crap and let the compiler do its
 		 *      job.
-		 *    - Use the proper memory barrier (rte_rmb()) to ensure the
-		 *      memory ordering below.
 		 */
 		rxdp = &rx_ring[rx_id];
 		staterr = rte_le_to_cpu_32(rxdp->wb.upper.status_error);
 
+		rte_dma_rmb();
 		if (!(staterr & IXGBE_RXDADV_STAT_DD))
 			break;
 
@@ -1704,7 +1692,7 @@  next_desc:
 			   "nb_hold=%u nb_rx=%u",
 			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
 
-		rte_wmb();
+		rte_dma_wmb();
 		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, prev_id);
 		nb_hold = 0;
 	}
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
index abd10f6..af4d779 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
@@ -123,6 +123,7 @@  ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
+	rte_dma_wmb();
 	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
 }
 
@@ -528,6 +529,7 @@  ixgbe_tx_free_bufs(struct ixgbe_tx_queue *txq)
 
 	/* check DD bit on threshold descriptor */
 	status = txq->tx_ring[txq->tx_next_dd].wb.status;
+	rte_dma_rmb();
 	if (!(status & IXGBE_ADVTXD_STAT_DD))
 		return 0;
 
@@ -645,6 +647,7 @@  ixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
+	rte_dma_wmb();
 	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;