[dpdk-dev,v3,07/10] vmxnet3: support jumbo frames

Message ID 1425600635-20628-8-git-send-email-stephen@networkplumber.org (mailing list archive)
State Superseded, archived

Commit Message

Stephen Hemminger March 6, 2015, 12:10 a.m. UTC
  From: Stephen Hemminger <shemming@brocade.com>

Add support for linking multi-segment buffers together to
handle Jumbo packets.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c |  3 +-
 lib/librte_pmd_vmxnet3/vmxnet3_ring.h   |  2 +
 lib/librte_pmd_vmxnet3/vmxnet3_rxtx.c   | 76 ++++++++++++++++++++-------------
 3 files changed, 50 insertions(+), 31 deletions(-)
  

Comments

Yong Wang March 9, 2015, 11:28 p.m. UTC | #1
On 3/5/15, 4:10 PM, "Stephen Hemminger" <stephen@networkplumber.org> wrote:

>From: Stephen Hemminger <shemming@brocade.com>
>
>Add support for linking multi-segment buffers together to
>handle Jumbo packets.
>
>Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
>---
> lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c |  3 +-
> lib/librte_pmd_vmxnet3/vmxnet3_ring.h   |  2 +
> lib/librte_pmd_vmxnet3/vmxnet3_rxtx.c   | 76 ++++++++++++++++++++-------------
> 3 files changed, 50 insertions(+), 31 deletions(-)
>
>diff --git a/lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c b/lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c
>index 35bb561..4f1bc4f 100644
>--- a/lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c
>+++ b/lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c
>@@ -401,6 +401,7 @@ vmxnet3_setup_driver_shared(struct rte_eth_dev *dev)
> {
> 	struct rte_eth_conf port_conf = dev->data->dev_conf;
> 	struct vmxnet3_hw *hw = dev->data->dev_private;
>+	uint32_t mtu = dev->data->mtu;
> 	Vmxnet3_DriverShared *shared = hw->shared;
> 	Vmxnet3_DSDevRead *devRead = &shared->devRead;
> 	uint32_t *mac_ptr;
>@@ -418,7 +419,7 @@ vmxnet3_setup_driver_shared(struct rte_eth_dev *dev)
> 	devRead->misc.driverInfo.vmxnet3RevSpt = 1;
> 	devRead->misc.driverInfo.uptVerSpt     = 1;
> 
>-	devRead->misc.mtu = rte_le_to_cpu_32(dev->data->mtu);
>+	devRead->misc.mtu = rte_le_to_cpu_32(mtu);

I didn’t see where mtu is used to calculate how many rx descriptors will
be needed for each packet.  Furthermore, as pointed out by the following
code comments, the device requires the first rx buffer of a packet to be
of type VMXNET3_RXD_BTYPE_HEAD, with the remaining buffers of type
VMXNET3_RXD_BTYPE_BODY.  This needs to be taken care of when populating rx
rings in vmxnet3_post_rx_bufs(). Currently we don’t do this because no
scatter-rx is supported and only one descriptor is needed for a packet
(thus all types should be HEAD). Otherwise, the device will complain and
return an error.  For the 2nd rx ring, type needs to be BODY for all
descriptors still.

Related to this, could you share what tests have been done to cover these
new features?

static int
vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t *rxq, uint8_t ring_id)
{
	int err = 0;
	uint32_t i = 0, val = 0;
	struct vmxnet3_cmd_ring *ring = &rxq->cmd_ring[ring_id];

	if (ring_id == 0) {
		/* Usually: One HEAD type buf per packet
		 * val = (ring->next2fill % rxq->hw->bufs_per_pkt) ?
		 *	VMXNET3_RXD_BTYPE_BODY : VMXNET3_RXD_BTYPE_HEAD;
		 */

		/* We use single packet buffer so all heads here */
		val = VMXNET3_RXD_BTYPE_HEAD;
	} else {
		/* All BODY type buffers for 2nd ring */
		val = VMXNET3_RXD_BTYPE_BODY;
	}
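
For reference, a minimal sketch (not part of this patch) of what the ring-0
type selection might look like once a packet can span several buffers, along
the lines of the commented-out code above; it assumes rxq->hw->bufs_per_pkt
is kept in sync with the MTU and the rx buffer size:

	if (ring_id == 0) {
		/* Sketch only: alternate HEAD/BODY on ring 0 when a packet
		 * can span bufs_per_pkt descriptors; with bufs_per_pkt == 1
		 * this degenerates to all-HEAD, as before. */
		val = (ring->next2fill % rxq->hw->bufs_per_pkt) ?
			VMXNET3_RXD_BTYPE_BODY : VMXNET3_RXD_BTYPE_HEAD;
	} else {
		/* All BODY type buffers for the 2nd ring */
		val = VMXNET3_RXD_BTYPE_BODY;
	}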



  
Yong Wang March 9, 2015, 11:32 p.m. UTC | #2
On 3/9/15, 4:28 PM, "Yong Wang" <yongwang@vmware.com> wrote:

>Currently we don’t do this because no scatter-rx is supported and only
>one descriptor is needed for a packet (thus all types should be HEAD).
>Otherwise, the device will complain and return an error.

To clarify, in this case only the 1st ring will be used and thus all types
will be HEAD.

>For the 2nd rx ring, type needs to be BODY for all descriptors still.
  
Stephen Hemminger March 10, 2015, 4:18 a.m. UTC | #3
On Mon, 9 Mar 2015 23:28:39 +0000
Yong Wang <yongwang@vmware.com> wrote:

> I didn’t see where mtu is used to calculate how many rx descriptors will
> be needed for each packet.  Furthermore, as pointed out by the following
> code comments, the device requires the first rx buffer of a packet to be
> of type VMXNET3_RXD_BTYPE_HEAD, with the remaining buffers of type
> VMXNET3_RXD_BTYPE_BODY.  This needs to be taken care of when populating rx
> rings in vmxnet3_post_rx_bufs(). Currently we don’t do this because no
> scatter-rx is supported and only one descriptor is needed for a packet
> (thus all types should be HEAD). Otherwise, the device will complain and
> return an error.  For the 2nd rx ring, type needs to be BODY for all
> descriptors still.

Yeah the reload logic needs work.

> Related to this, could you share what tests have been done to cover these
> new features?

Not really. We only test with our product, which is built on top of DPDK,
not standalone.

Most of these came from figuring out what features were missing from the
DPDK driver that Intel supplied. I had independently developed much better
virtio and vmxnet3 drivers which were well tested, but the DPDK maintainers
would not accept wholesale replacement of the driver. Therefore we have
been trying to selectively add the bits from my driver back into the
Intel driver.
  
Stephen Hemminger March 10, 2015, 6:35 p.m. UTC | #4
On Mon, 9 Mar 2015 23:32:48 +0000
Yong Wang <yongwang@vmware.com> wrote:

> >I didn’t see where mtu is used to calculate how many rx descriptors will
> >be needed for each packet.  Furthermore, as pointed out by the following
> >code comments, the device requires the first rx buffer of a packet to be
> >of type VMXNET3_RXD_BTYPE_HEAD, with the remaining buffers of type
> >VMXNET3_RXD_BTYPE_BODY.  This needs to be taken care of when populating rx
> >rings in vmxnet3_post_rx_bufs(). Currently we don’t do this because no
> >scatter-rx is supported and only one descriptor is needed for a packet
> >(thus all types should be HEAD). Otherwise, the device will complain and
> >return an error.
> 
> To clarify, in this case only the 1st ring will be used and thus all types
> will be HEAD.
> 
> >For the 2nd rx ring, type needs to be BODY for all
> >descriptors still.

Looking at this in more detail, the code as submitted looks fine:

* The # of rx descriptors needed for each packet is the responsibility of
  the application. Before and after this patch, the # of Rx descriptors
  for each ring (head/body) is determined by vmxnet3_rx_queue_setup.
  And the driver always allocates the same number of descriptors
  for both Rx rings.

  One could argue that was a bug in the original driver, since it needlessly
  allocated mbufs for the second ring and never used them. I suppose, as an
  optimization in the future, the second ring could be sized to 0 and no
  buffers allocated unless doing rx-scatter (this is what the Linux driver
  does). But that change is independent of this patch.

* Buffers are correctly reallocated as needed when consumed.

* The code works for bulk transfer and performance tests in jumbo mode.

I see no requirement to change this patch.
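
As a rough illustration of the first bullet above — deriving how many rx
buffers a jumbo frame occupies from the configured max packet length and the
mempool's buffer size — here is a hedged sketch; the helper name is
illustrative and not part of the driver or this patch:

#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Illustrative sketch: number of rx descriptors a frame of
 * max_rx_pkt_len bytes occupies, taking the mempool's data room
 * minus headroom as the per-descriptor buffer size. */
static uint16_t
rx_bufs_per_pkt(struct rte_mempool *mp, uint32_t max_rx_pkt_len)
{
	struct rte_pktmbuf_pool_private *mbp_priv = rte_mempool_get_priv(mp);
	uint16_t buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;

	return (max_rx_pkt_len + buf_size - 1) / buf_size;
}

For example, with roughly 2 KB of usable buffer per mbuf, a 9000-byte frame
spreads across 5 descriptors.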
  
Yong Wang March 11, 2015, 1:03 a.m. UTC | #5
On 3/10/15, 11:35 AM, "Stephen Hemminger" <stephen@networkplumber.org>
wrote:

>On Mon, 9 Mar 2015 23:32:48 +0000
>Yong Wang <yongwang@vmware.com> wrote:
>
>> >I didn’t see where mtu is used to calculate how many rx descriptors will
>> >be needed for each packet.  Furthermore, as pointed out by the following
>> >code comments, the device requires the first rx buffer of a packet to be
>> >of type VMXNET3_RXD_BTYPE_HEAD, with the remaining buffers of type
>> >VMXNET3_RXD_BTYPE_BODY.  This needs to be taken care of when populating
>> >rx rings in vmxnet3_post_rx_bufs(). Currently we don’t do this because no
>> >scatter-rx is supported and only one descriptor is needed for a packet
>> >(thus all types should be HEAD). Otherwise, the device will complain and
>> >return an error.
>> 
>> To clarify, in this case only the 1st ring will be used and thus all
>> types will be HEAD.
>> 
>> >For the 2nd rx ring, type needs to be BODY for all
>> >descriptors still.
>
>Looking at this in more detail, the code as submitted looks fine:
>
>* The # of rx descriptors needed for each packet is the responsibility of
>  the application. Before and after this patch, the # of Rx descriptors
>  for each ring (head/body) is determined by vmxnet3_rx_queue_setup.
>  And the driver always allocates the same number of descriptors
>  for both Rx rings.

Where is vmxnet3_rx_queue_setup? It’s not in the current code base nor in
your patch, and I failed to see how rx descriptors are populated.

Could you explain what you mean by responsibility of the application?
If mtu is set to 9000, the driver needs to make sure the rx buf per packet
is set to the correct size with the proper type set, right?

>  One could argue that was a bug in the original driver, since it needlessly
>  allocated mbufs for the second ring and never used them. I suppose, as an
>  optimization in the future, the second ring could be sized to 0 and no
>  buffers allocated unless doing rx-scatter (this is what the Linux driver
>  does). But that change is independent of this patch.

We can just keep it as is, as we will need it if someone decides to support
LRO for vmxnet3.

>* Buffers are correctly reallocated as needed when consumed.
>
>* The code works for bulk transfer and performance tests in jumbo mode.

You mean the standalone patch you posted without the
vmxnet3_rx_queue_setup you mentioned?

>I see no requirement to change this patch.
  
Stephen Hemminger March 11, 2015, 6:54 a.m. UTC | #6
On Wed, 11 Mar 2015 01:03:37 +0000
Yong Wang <yongwang@vmware.com> wrote:

> >Looking at this in more detail, the code as submitted looks fine:
> >
> >* The # of rx descriptors needed for each packet is the responsibility of
> >  the application. Before and after this patch, the # of Rx descriptors
> >  for each ring (head/body) is determined by vmxnet3_rx_queue_setup.
> >  And the driver always allocates the same number of descriptors
> >  for both Rx rings.
> 
> Where is vmxnet3_rx_queue_setup? It’s not in the current code base nor in
> your patch, and I failed to see how rx descriptors are populated.
> 
> Could you explain what you mean by responsibility of the application?
> If mtu is set to 9000, the driver needs to make sure the rx buf per packet
> is set to the correct size with the proper type set, right?

vmxnet3_dev_rx_queue_setup is unchanged from the original Intel code.

But you are right: there is a bug there, since it will error out
if max_rx_pkt_len > bufsize.  That error check should have been removed.
That was an oversight from the merge of our code with the DPDK/Intel
code. Just removing the check and the surrounding code is all that is
necessary.

After that, the driver will receive scattered mbufs.
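
For context, here is a paraphrase (from memory, not verbatim; exact
variable names and log message may differ) of the kind of check in
vmxnet3_dev_rx_queue_setup() being referred to — it rejects a
max_rx_pkt_len larger than one mbuf's data room, which is what blocks
scattered rx:

	/* Sketch of the error check that would have to be dropped
	 * before scattered rx can engage. */
	struct rte_pktmbuf_pool_private *mbp_priv = rte_mempool_get_priv(mp);
	uint16_t buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;

	if (dev->data->dev_conf.rxmode.max_rx_pkt_len > buf_size) {
		PMD_INIT_LOG(ERR, "max_rx_pkt_len %u larger than rx buffer size %u",
			     dev->data->dev_conf.rxmode.max_rx_pkt_len, buf_size);
		return -EINVAL;
	}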

None of the DPDK drivers do a good job of checking all the possible
misconfigurations of the rxmode, jumbo, and scattered flags. There is room
for improvement in all drivers around that.


> >  One could argue that was a bug in the original driver, since it
> >  needlessly allocated mbufs for the second ring and never used them.
> >  I suppose, as an optimization in the future, the second ring could be
> >  sized to 0 and no buffers allocated unless doing rx-scatter (this is
> >  what the Linux driver does). But that change is independent of this
> >  patch.
> 
> We can just keep it as is, as we will need it if someone decides to
> support LRO for vmxnet3.

But LRO must be optional, since LRO is broken for the forwarding case.
The Linux driver only allocates the second ring when necessary.
Although the Linux vmxnet3 driver's handling of the LRO setting was broken
for several years, it is fixed now.


> >
> >* Buffers are correctly reallocated as needed when consumed.
> >
> >* The code works for bulk transfer and performance tests in jumbo mode.  
> 
> You mean the standalone patch you posted without the
> vmxnet3_rx_queue_setup you mentioned?

The code in the receive path that calls vmxnet3_post_rx_bufs already works
for both rings. It was just that, until the scattered logic was engaged,
the second ring was never used.
  

Patch

diff --git a/lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c b/lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c
index 35bb561..4f1bc4f 100644
--- a/lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c
+++ b/lib/librte_pmd_vmxnet3/vmxnet3_ethdev.c
@@ -401,6 +401,7 @@  vmxnet3_setup_driver_shared(struct rte_eth_dev *dev)
 {
 	struct rte_eth_conf port_conf = dev->data->dev_conf;
 	struct vmxnet3_hw *hw = dev->data->dev_private;
+	uint32_t mtu = dev->data->mtu;
 	Vmxnet3_DriverShared *shared = hw->shared;
 	Vmxnet3_DSDevRead *devRead = &shared->devRead;
 	uint32_t *mac_ptr;
@@ -418,7 +419,7 @@  vmxnet3_setup_driver_shared(struct rte_eth_dev *dev)
 	devRead->misc.driverInfo.vmxnet3RevSpt = 1;
 	devRead->misc.driverInfo.uptVerSpt     = 1;
 
-	devRead->misc.mtu = rte_le_to_cpu_32(dev->data->mtu);
+	devRead->misc.mtu = rte_le_to_cpu_32(mtu);
 	devRead->misc.queueDescPA  = hw->queueDescPA;
 	devRead->misc.queueDescLen = hw->queue_desc_len;
 	devRead->misc.numTxQueues  = hw->num_tx_queues;
diff --git a/lib/librte_pmd_vmxnet3/vmxnet3_ring.h b/lib/librte_pmd_vmxnet3/vmxnet3_ring.h
index 612487e..55ceadf 100644
--- a/lib/librte_pmd_vmxnet3/vmxnet3_ring.h
+++ b/lib/librte_pmd_vmxnet3/vmxnet3_ring.h
@@ -171,6 +171,8 @@  typedef struct vmxnet3_rx_queue {
 	uint32_t                    qid1;
 	uint32_t                    qid2;
 	Vmxnet3_RxQueueDesc         *shared;
+	struct rte_mbuf		    *start_seg;
+	struct rte_mbuf		    *last_seg;
 	struct vmxnet3_rxq_stats    stats;
 	bool                        stopped;
 	uint16_t                    queue_id;      /**< Device RX queue index. */
diff --git a/lib/librte_pmd_vmxnet3/vmxnet3_rxtx.c b/lib/librte_pmd_vmxnet3/vmxnet3_rxtx.c
index ba48a12..5cf187a 100644
--- a/lib/librte_pmd_vmxnet3/vmxnet3_rxtx.c
+++ b/lib/librte_pmd_vmxnet3/vmxnet3_rxtx.c
@@ -571,7 +571,6 @@  vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	vmxnet3_rx_queue_t *rxq;
 	Vmxnet3_RxCompDesc *rcd;
 	vmxnet3_buf_info_t *rbi;
-	Vmxnet3_RxDesc *rxd;
 	struct rte_mbuf *rxm = NULL;
 	struct vmxnet3_hw *hw;
 
@@ -596,42 +595,18 @@  vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 
 		idx = rcd->rxdIdx;
 		ring_idx = (uint8_t)((rcd->rqID == rxq->qid1) ? 0 : 1);
-		rxd = (Vmxnet3_RxDesc *)rxq->cmd_ring[ring_idx].base + idx;
 		rbi = rxq->cmd_ring[ring_idx].buf_info + idx;
 
-		if (unlikely(rcd->sop != 1 || rcd->eop != 1)) {
-			rte_pktmbuf_free_seg(rbi->m);
-			PMD_RX_LOG(DEBUG, "Packet spread across multiple buffers\n)");
-			goto rcd_done;
-		}
 
 		PMD_RX_LOG(DEBUG, "rxd idx: %d ring idx: %d.", idx, ring_idx);
 
 #ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER
+		Vmxnet3_RxDesc *rxd
+			= (Vmxnet3_RxDesc *)rxq->cmd_ring[ring_idx].base + idx;
 		VMXNET3_ASSERT(rcd->len <= rxd->len);
 		VMXNET3_ASSERT(rbi->m);
 #endif
-		if (unlikely(rcd->len == 0)) {
-			PMD_RX_LOG(DEBUG, "Rx buf was skipped. rxring[%d][%d]\n)",
-				   ring_idx, idx);
-#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER
-			VMXNET3_ASSERT(rcd->sop && rcd->eop);
-#endif
-			rte_pktmbuf_free_seg(rbi->m);
-			goto rcd_done;
-		}
 
-		/* Assuming a packet is coming in a single packet buffer */
-		if (unlikely(rxd->btype != VMXNET3_RXD_BTYPE_HEAD)) {
-			PMD_RX_LOG(DEBUG,
-				   "Alert : Misbehaving device, incorrect "
-				   " buffer type used. iPacket dropped.");
-			rte_pktmbuf_free_seg(rbi->m);
-			goto rcd_done;
-		}
-#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER
-		VMXNET3_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_HEAD);
-#endif
 		/* Get the packet buffer pointer from buf_info */
 		rxm = rbi->m;
 
@@ -643,7 +618,7 @@  vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		rxq->cmd_ring[ring_idx].next2comp = idx;
 
 		/* For RCD with EOP set, check if there is frame error */
-		if (unlikely(rcd->err)) {
+		if (unlikely(rcd->eop && rcd->err)) {
 			rxq->stats.drop_total++;
 			rxq->stats.drop_err++;
 
@@ -669,9 +644,49 @@  vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		rxm->ol_flags = 0;
 		rxm->vlan_tci = 0;
 
-		vmxnet3_rx_offload(rcd, rxm);
+		/*
+		 * If this is the first buffer of the received packet,
+		 * set the pointer to the first mbuf of the packet
+		 * Otherwise, update the total length and the number of segments
+		 * of the current scattered packet, and update the pointer to
+		 * the last mbuf of the current packet.
+		 */
+		if (rcd->sop) {
+#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER
+			VMXNET3_ASSERT(!rxq->start_seg);
+			VMXNET3_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_HEAD);
+#endif
+
+			if (unlikely(rcd->len == 0)) {
+				PMD_RX_LOG(DEBUG,
+					   "Rx buf was skipped. rxring[%d][%d])",
+					   ring_idx, idx);
+				rte_pktmbuf_free_seg(rbi->m);
+				goto rcd_done;
+			}
+
+			rxq->start_seg = rxm;
+			vmxnet3_rx_offload(rcd, rxm);
+		} else {
+			struct rte_mbuf *start = rxq->start_seg;
+
+#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER
+			VMXNET3_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_BODY);
+			VMXNET3_ASSERT(start != NULL);
+#endif
+
+			start->pkt_len += rxm->data_len;
+			start->nb_segs++;
+
+			rxq->last_seg->next = rxm;
+		}
+		rxq->last_seg = rxm;
+
+		if (rcd->eop) {
+			rx_pkts[nb_rx++] = rxq->start_seg;
+			rxq->start_seg = NULL;
+		}
 
-		rx_pkts[nb_rx++] = rxm;
 rcd_done:
 		rxq->cmd_ring[ring_idx].next2comp = idx;
 		VMXNET3_INC_RING_IDX_ONLY(rxq->cmd_ring[ring_idx].next2comp, rxq->cmd_ring[ring_idx].size);
@@ -975,6 +990,7 @@  vmxnet3_dev_rxtx_init(struct rte_eth_dev *dev)
 			}
 		}
 		rxq->stopped = FALSE;
+		rxq->start_seg = NULL;
 	}
 
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {