
[v4,7/9] net/gve: add support for Rx/Tx

Message ID: 20220927073255.1803892-8-junfeng.guo@intel.com (mailing list archive)
State: Changes Requested, archived
Delegated to: Ferruh Yigit
Series: introduce GVE PMD

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Guo, Junfeng Sept. 27, 2022, 7:32 a.m. UTC
Add Rx/Tx of GQI_QPL queue format and GQI_RDA queue format.

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
---
 doc/guides/nics/features/gve.ini |   2 +
 drivers/net/gve/gve_ethdev.c     |   5 +
 drivers/net/gve/gve_ethdev.h     |  16 ++
 drivers/net/gve/gve_rx.c         | 143 ++++++++++
 drivers/net/gve/gve_tx.c         | 455 +++++++++++++++++++++++++++++++
 5 files changed, 621 insertions(+)

Comments

Ferruh Yigit Oct. 6, 2022, 2:24 p.m. UTC | #1
On 9/27/2022 8:32 AM, Junfeng Guo wrote:

> 
> Add Rx/Tx of GQI_QPL queue format and GQI_RDA queue format.
> 
> Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>

<...>

> --- a/drivers/net/gve/gve_ethdev.c
> +++ b/drivers/net/gve/gve_ethdev.c
> @@ -583,6 +583,11 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
>          if (err)
>                  return err;
> 
> +       if (gve_is_gqi(priv)) {
> +               eth_dev->rx_pkt_burst = gve_rx_burst;
> +               eth_dev->tx_pkt_burst = gve_tx_burst;
> +       }
> +

What do you think about adding a log here for the 'else' case, to inform the
user why the datapath is not working?

<...>

> +uint16_t
> +gve_rx_burst(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
> +{
> +       volatile struct gve_rx_desc *rxr, *rxd;
> +       struct gve_rx_queue *rxq = rx_queue;
> +       uint16_t rx_id = rxq->rx_tail;
> +       struct rte_mbuf *rxe;
> +       uint16_t nb_rx, len;
> +       uint64_t addr;
> +
> +       rxr = rxq->rx_desc_ring;
> +
> +       for (nb_rx = 0; nb_rx < nb_pkts; nb_rx++) {
> +               rxd = &rxr[rx_id];
> +               if (GVE_SEQNO(rxd->flags_seq) != rxq->expected_seqno)
> +                       break;
> +
> +               if (rxd->flags_seq & GVE_RXF_ERR)
> +                       continue;
> +
> +               len = rte_be_to_cpu_16(rxd->len) - GVE_RX_PAD;
> +               rxe = rxq->sw_ring[rx_id];
> +               rxe->data_off = RTE_PKTMBUF_HEADROOM;
> +               if (rxq->is_gqi_qpl) {
> +                       addr = (uint64_t)(rxq->qpl->mz->addr) + rx_id * PAGE_SIZE + GVE_RX_PAD;
> +                       rte_memcpy((void *)((size_t)rxe->buf_addr + rxe->data_off),
> +                                  (void *)(size_t)addr, len);

Why is a 'memcpy' needed? Can't it DMA to the mbuf data buffer?

> +               }
> +               rxe->nb_segs = 1;
> +               rxe->next = NULL;
> +               rxe->pkt_len = len;
> +               rxe->data_len = len;
> +               rxe->port = rxq->port_id;
> +               rxe->packet_type = 0;
> +               rxe->ol_flags = 0;
> +

As far as I can see, 'sw_ring[]' is filled using the 'rte_pktmbuf_alloc_bulk()' 
API, which should reset mbuf fields to default values, so some of the 
assignments above may be redundant.

> +               if (rxd->flags_seq & GVE_RXF_TCP)
> +                       rxe->packet_type |= RTE_PTYPE_L4_TCP;
> +               if (rxd->flags_seq & GVE_RXF_UDP)
> +                       rxe->packet_type |= RTE_PTYPE_L4_UDP;
> +               if (rxd->flags_seq & GVE_RXF_IPV4)
> +                       rxe->packet_type |= RTE_PTYPE_L3_IPV4;
> +               if (rxd->flags_seq & GVE_RXF_IPV6)
> +                       rxe->packet_type |= RTE_PTYPE_L3_IPV6;
> +

If you are setting packet_type, it is better to implement the 
'dev_supported_ptypes_get()' dev_ops too, to announce to the host which 
packet types the parsing supports. (+ dev_ptypes_set() dev_ops)
Later the driver can also announce the "Packet type parsing" feature in the .ini file.
Guo, Junfeng Oct. 9, 2022, 9:14 a.m. UTC | #2
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@amd.com>
> Sent: Thursday, October 6, 2022 22:25
> To: Guo, Junfeng <junfeng.guo@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> Cc: ferruh.yigit@xilinx.com; dev@dpdk.org; Li, Xiaoyun
> <xiaoyun.li@intel.com>; awogbemila@google.com; Richardson, Bruce
> <bruce.richardson@intel.com>; Lin, Xueqin <xueqin.lin@intel.com>
> Subject: Re: [PATCH v4 7/9] net/gve: add support for Rx/Tx
> 
> On 9/27/2022 8:32 AM, Junfeng Guo wrote:
> 
> >
> > Add Rx/Tx of GQI_QPL queue format and GQI_RDA queue format.
> >
> > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> > Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
> 
> <...>
> 
> > --- a/drivers/net/gve/gve_ethdev.c
> > +++ b/drivers/net/gve/gve_ethdev.c
> > @@ -583,6 +583,11 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
> >          if (err)
> >                  return err;
> >
> > +       if (gve_is_gqi(priv)) {
> > +               eth_dev->rx_pkt_burst = gve_rx_burst;
> > +               eth_dev->tx_pkt_burst = gve_tx_burst;
> > +       }
> > +
> 
> What do you think to add a log here for 'else' case, to inform user why
> datapath is not working?

Agreed, makes sense!
Currently only one queue mode (i.e., QPL mode) is supported on the GCP
env. Will add a log to inform about this in the else case. Thanks!

> 
> <...>
> 
> > +uint16_t
> > +gve_rx_burst(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t
> nb_pkts)
> > +{
> > +       volatile struct gve_rx_desc *rxr, *rxd;
> > +       struct gve_rx_queue *rxq = rx_queue;
> > +       uint16_t rx_id = rxq->rx_tail;
> > +       struct rte_mbuf *rxe;
> > +       uint16_t nb_rx, len;
> > +       uint64_t addr;
> > +
> > +       rxr = rxq->rx_desc_ring;
> > +
> > +       for (nb_rx = 0; nb_rx < nb_pkts; nb_rx++) {
> > +               rxd = &rxr[rx_id];
> > +               if (GVE_SEQNO(rxd->flags_seq) != rxq->expected_seqno)
> > +                       break;
> > +
> > +               if (rxd->flags_seq & GVE_RXF_ERR)
> > +                       continue;
> > +
> > +               len = rte_be_to_cpu_16(rxd->len) - GVE_RX_PAD;
> > +               rxe = rxq->sw_ring[rx_id];
> > +               rxe->data_off = RTE_PKTMBUF_HEADROOM;
> > +               if (rxq->is_gqi_qpl) {
> > +                       addr = (uint64_t)(rxq->qpl->mz->addr) + rx_id * PAGE_SIZE
> + GVE_RX_PAD;
> > +                       rte_memcpy((void *)((size_t)rxe->buf_addr + rxe-
> >data_off),
> > +                                  (void *)(size_t)addr, len);
> 
> Why a 'memcpy' is needed? Can't it DMA to mbuf data buffer?

Well, only QPL (queue page list) mode is supported on the GCP env now.
So the DMA may not be used in the current case.

> 
> > +               }
> > +               rxe->nb_segs = 1;
> > +               rxe->next = NULL;
> > +               rxe->pkt_len = len;
> > +               rxe->data_len = len;
> > +               rxe->port = rxq->port_id;
> > +               rxe->packet_type = 0;
> > +               rxe->ol_flags = 0;
> > +
> 
> As far as I can see 'sw_ring[]' filled using 'rte_pktmbuf_alloc_bulk()'
> API, which should reset mbuf fields to default values, so some of the
> assignment above can be redundant.

Yes, some fields are already assigned in 'rte_pktmbuf_reset()'.
Will remove the redundant ones in the coming version. Thanks!
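
For reference, a minimal sketch of what the trimmed block could look like
(assuming the mbufs come straight from rte_pktmbuf_alloc_bulk(), whose
rte_pktmbuf_reset() already puts nb_segs, next, ol_flags, packet_type and
data_off at their defaults):

		rxe = rxq->sw_ring[rx_id];
		/* nb_segs, next, ol_flags, packet_type and data_off are already
		 * at their defaults after rte_pktmbuf_alloc_bulk(), so only the
		 * per-packet fields need to be written here.
		 */
		rxe->pkt_len = len;
		rxe->data_len = len;
		rxe->port = rxq->port_id;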

> 
> > +               if (rxd->flags_seq & GVE_RXF_TCP)
> > +                       rxe->packet_type |= RTE_PTYPE_L4_TCP;
> > +               if (rxd->flags_seq & GVE_RXF_UDP)
> > +                       rxe->packet_type |= RTE_PTYPE_L4_UDP;
> > +               if (rxd->flags_seq & GVE_RXF_IPV4)
> > +                       rxe->packet_type |= RTE_PTYPE_L3_IPV4;
> > +               if (rxd->flags_seq & GVE_RXF_IPV6)
> > +                       rxe->packet_type |= RTE_PTYPE_L3_IPV6;
> > +
> 
> If you are setting packet_type, it is better to implement the
> 'dev_supported_ptypes_get()' dev_ops too, to announce to the host which
> packet types the parsing supports. (+ dev_ptypes_set() dev_ops)
> And later the driver can announce the "Packet type parsing" feature in the .ini file.

Well, on current GCP env, the APIs for supported ptypes get/set have not
been exposed even in the base code. The only one in the base code is for
the dqo mode (gve_adminq_get_ptype_map_dqo). But this also cannot
be used on current GCP env. We can only implement this once they are
supported and exposed at GCP. Thanks!
Li, Xiaoyun Oct. 10, 2022, 9:39 a.m. UTC | #3
Hi

> -----Original Message-----
> From: Guo, Junfeng <junfeng.guo@intel.com>
> Sent: Sunday, October 9, 2022 10:15
> To: Ferruh Yigit <ferruh.yigit@amd.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> Cc: ferruh.yigit@xilinx.com; dev@dpdk.org; Li, Xiaoyun
> <xiaoyun.li@intel.com>; awogbemila@google.com; Richardson, Bruce
> <bruce.richardson@intel.com>; Lin, Xueqin <xueqin.lin@intel.com>
> Subject: RE: [PATCH v4 7/9] net/gve: add support for Rx/Tx
> 
> 
> 
> > -----Original Message-----
> > From: Ferruh Yigit <ferruh.yigit@amd.com>
> > Sent: Thursday, October 6, 2022 22:25
> > To: Guo, Junfeng <junfeng.guo@intel.com>; Zhang, Qi Z
> > <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> > Cc: ferruh.yigit@xilinx.com; dev@dpdk.org; Li, Xiaoyun
> > <xiaoyun.li@intel.com>; awogbemila@google.com; Richardson, Bruce
> > <bruce.richardson@intel.com>; Lin, Xueqin <xueqin.lin@intel.com>
> > Subject: Re: [PATCH v4 7/9] net/gve: add support for Rx/Tx
> >
> > On 9/27/2022 8:32 AM, Junfeng Guo wrote:
> >
> > >
> > > Add Rx/Tx of GQI_QPL queue format and GQI_RDA queue format.
> > >
> > > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> > > Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
> >
> > <...>
> >
> > > --- a/drivers/net/gve/gve_ethdev.c
> > > +++ b/drivers/net/gve/gve_ethdev.c
> > > @@ -583,6 +583,11 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
> > >          if (err)
> > >                  return err;
> > >
> > > +       if (gve_is_gqi(priv)) {
> > > +               eth_dev->rx_pkt_burst = gve_rx_burst;
> > > +               eth_dev->tx_pkt_burst = gve_tx_burst;
> > > +       }
> > > +
> >
> > What do you think to add a log here for 'else' case, to inform user
> > why datapath is not working?
> 
> Agreed, make sense!
> Currently only one queue mode (i.e., qpl mode) is supported on the GCP env.
> Will add a log to inform this in the else case. Thanks!

This explanation is not correct. Only QPL mode is supported in GCP now; that is an env limitation, but it is not related to the else branch here.
gve_is_gqi() covers two modes, GQI_QPL and GQI_RDA, and both of these datapaths are supported in the Rx/Tx code.
GQI means a single queue model (txq for Tx and rxq for Rx), and there are two variants of this queue model: QPL and RDA.
QPL needs to copy packets from/to several reserved pages negotiated with the backend. RDA works like a normal device and uses PAs in the descriptors.

The datapath that is not supported is DQO_RDA, which uses different hardware and therefore a different queue model (split/double queue model): Tx uses a txq plus a tx_completion_q, and Rx uses an rxq plus an rx_completion_q.
This is not implemented in the datapath for now and will be added in the future.

So if you want to add a log here, please say "DQO_RDA is not implemented and will be added in the future". Don't say it's not available in the GCP env, which is not the reason.
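
A minimal sketch of what that else branch could look like, using the
PMD_DRV_LOG macro already present in gve_ethdev.c (the exact wording is only
a suggestion):

	if (gve_is_gqi(priv)) {
		eth_dev->rx_pkt_burst = gve_rx_burst;
		eth_dev->tx_pkt_burst = gve_tx_burst;
	} else {
		/* The DQO_RDA datapath is not implemented yet, so leave the
		 * burst callbacks unset and tell the user why Rx/Tx will not
		 * work.
		 */
		PMD_DRV_LOG(ERR,
			    "DQO_RDA queue format is not implemented yet, Rx/Tx will not work");
	}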

> 
> >
> > <...>
> >
> > > +uint16_t
> > > +gve_rx_burst(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t
> > nb_pkts)
> > > +{
> > > +       volatile struct gve_rx_desc *rxr, *rxd;
> > > +       struct gve_rx_queue *rxq = rx_queue;
> > > +       uint16_t rx_id = rxq->rx_tail;
> > > +       struct rte_mbuf *rxe;
> > > +       uint16_t nb_rx, len;
> > > +       uint64_t addr;
> > > +
> > > +       rxr = rxq->rx_desc_ring;
> > > +
> > > +       for (nb_rx = 0; nb_rx < nb_pkts; nb_rx++) {
> > > +               rxd = &rxr[rx_id];
> > > +               if (GVE_SEQNO(rxd->flags_seq) != rxq->expected_seqno)
> > > +                       break;
> > > +
> > > +               if (rxd->flags_seq & GVE_RXF_ERR)
> > > +                       continue;
> > > +
> > > +               len = rte_be_to_cpu_16(rxd->len) - GVE_RX_PAD;
> > > +               rxe = rxq->sw_ring[rx_id];
> > > +               rxe->data_off = RTE_PKTMBUF_HEADROOM;
> > > +               if (rxq->is_gqi_qpl) {
> > > +                       addr = (uint64_t)(rxq->qpl->mz->addr) +
> > > + rx_id * PAGE_SIZE
> > + GVE_RX_PAD;
> > > +                       rte_memcpy((void *)((size_t)rxe->buf_addr +
> > > + rxe-
> > >data_off),
> > > +                                  (void *)(size_t)addr, len);
> >
> > Why a 'memcpy' is needed? Can't it DMA to mbuf data buffer?

When the queue model is gqi_qpl (this is negotiated and obtained via the adminq with the backend), the driver needs to register a block of memory (called a queue page list) with the device. Tx then has to copy packets into this memory, and Rx gets packets from this area.
The backend is responsible for moving packets between this memory and the device/line (getting them for Tx, placing them for Rx); we don't really know how the backend does this.
Please refer to https://www.kernel.org/doc/html/v5.4/networking/device_drivers/google/gve.html, which has a bit more explanation about this queue format.
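
In other words, the Rx side of this patch boils down to the following
(a condensed sketch of the gve_rx_burst logic, reusing the patch's own fields
and the PAGE_SIZE and GVE_RX_PAD constants from the base code):

		if (rxq->is_gqi_qpl) {
			/* QPL: the payload was DMA'd into the pre-registered
			 * page owned by this descriptor slot, so it has to be
			 * copied into the mbuf data buffer.
			 */
			addr = (uint64_t)rxq->qpl->mz->addr + rx_id * PAGE_SIZE + GVE_RX_PAD;
			rte_memcpy(rte_pktmbuf_mtod(rxe, void *), (void *)(size_t)addr, len);
		} else {
			/* GQI_RDA: the refill path already posted the mbuf IOVA
			 * in rx_data_ring, so the device DMA'd straight into the
			 * mbuf and no copy is needed here.
			 */
		}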

> 
> Well, only qpl (queue page list) mode supported on the GCP env now.
> So the DMA may not be used in current case.

And yes, it's because GCP doesn't support GQI_RDA for now that GQI_QPL has to be implemented. But even if the GCP env supports RDA in the future, unless they completely remove QPL support, QPL is still needed,
because the queue format/model is obtained from the backend through gve_adminq_describe_device(). You may just get the QPL version; you can't really control which queue format you get.

> 
> >
> > > +               }
> > > +               rxe->nb_segs = 1;
> > > +               rxe->next = NULL;
> > > +               rxe->pkt_len = len;
> > > +               rxe->data_len = len;
> > > +               rxe->port = rxq->port_id;
> > > +               rxe->packet_type = 0;
> > > +               rxe->ol_flags = 0;
> > > +
> >
> > As far as I can see 'sw_ring[]' filled using 'rte_pktmbuf_alloc_bulk()'
> > API, which should reset mbuf fields to default values, so some of the
> > assignment above can be redundant.
> 
> Yes, some fields are already assigned at 'rte_pktmbuf_reset()'.
> Will remove the redundant ones in the coming version. Thanks!
> 
> >
> > > +               if (rxd->flags_seq & GVE_RXF_TCP)
> > > +                       rxe->packet_type |= RTE_PTYPE_L4_TCP;
> > > +               if (rxd->flags_seq & GVE_RXF_UDP)
> > > +                       rxe->packet_type |= RTE_PTYPE_L4_UDP;
> > > +               if (rxd->flags_seq & GVE_RXF_IPV4)
> > > +                       rxe->packet_type |= RTE_PTYPE_L3_IPV4;
> > > +               if (rxd->flags_seq & GVE_RXF_IPV6)
> > > +                       rxe->packet_type |= RTE_PTYPE_L3_IPV6;
> > > +
> >
> > If you are setting packet_type, it is better to implement the
> > 'dev_supported_ptypes_get()' dev_ops too, to announce to the host which
> > packet types the parsing supports. (+ dev_ptypes_set() dev_ops) And later
> > the driver can announce the "Packet type parsing" feature in the .ini file.
> 
> Well, on current GCP env, the APIs for supported ptypes get/set have not
> been exposed even in the base code. The only one in the base code is for
> the dqo mode (gve_adminq_get_ptype_map_dqo). But this also cannot be
> used on current GCP env. We can only implement this once they are
> supported and exposed at GCP. Thanks!

You're mixing up the concepts again. "The GCP env only supports QPL" is not an excuse here.
Packet type reporting is supported even in QPL; it is just limited to L4_TCP/UDP and L3_IPV4/6. Ptypes_get is possible, and it will return RTE_PTYPE_L3_IPV4/6 and RTE_PTYPE_L4_UDP/TCP.
The DQO mode you mentioned will be more flexible and have more support. I'm not sure what your plan is, but it can be implemented whenever, based on that plan rather than on GCP env availability. The base code is there; it's just that you may not be able to verify and debug it in a timely manner.

Ptype_set is not supported since the hardware doesn't support it (there's no such adminq).
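
A minimal sketch of such a callback, assuming the dev_supported_ptypes_get
prototype of this DPDK release (the function name is hypothetical, and the list
simply mirrors the four flags parsed in gve_rx_burst):

static const uint32_t *
gve_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
{
	/* Only what the GQI Rx descriptor flags can report today. */
	static const uint32_t ptypes[] = {
		RTE_PTYPE_L3_IPV4,
		RTE_PTYPE_L3_IPV6,
		RTE_PTYPE_L4_TCP,
		RTE_PTYPE_L4_UDP,
		RTE_PTYPE_UNKNOWN,
	};

	return ptypes;
}

/* hooked up via .dev_supported_ptypes_get = gve_dev_supported_ptypes_get
 * in the eth_dev_ops structure.
 */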
Guo, Junfeng Oct. 10, 2022, 10:18 a.m. UTC | #4
Thanks Xiaoyun for helping explain, it helps a lot!

> -----Original Message-----
> From: Li, Xiaoyun <xiaoyun.li@intel.com>
> Sent: Monday, October 10, 2022 17:40
> To: Guo, Junfeng <junfeng.guo@intel.com>; Ferruh Yigit
> <ferruh.yigit@amd.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> Jingjing <jingjing.wu@intel.com>
> Cc: ferruh.yigit@xilinx.com; dev@dpdk.org; awogbemila@google.com;
> Richardson, Bruce <bruce.richardson@intel.com>; Lin, Xueqin
> <xueqin.lin@intel.com>
> Subject: RE: [PATCH v4 7/9] net/gve: add support for Rx/Tx
> 
> Hi
> 
> > -----Original Message-----
> > From: Guo, Junfeng <junfeng.guo@intel.com>
> > Sent: Sunday, October 9, 2022 10:15
> > To: Ferruh Yigit <ferruh.yigit@amd.com>; Zhang, Qi Z
> > <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> > Cc: ferruh.yigit@xilinx.com; dev@dpdk.org; Li, Xiaoyun
> > <xiaoyun.li@intel.com>; awogbemila@google.com; Richardson, Bruce
> > <bruce.richardson@intel.com>; Lin, Xueqin <xueqin.lin@intel.com>
> > Subject: RE: [PATCH v4 7/9] net/gve: add support for Rx/Tx
> >
> >
> >
> > > -----Original Message-----
> > > From: Ferruh Yigit <ferruh.yigit@amd.com>
> > > Sent: Thursday, October 6, 2022 22:25
> > > To: Guo, Junfeng <junfeng.guo@intel.com>; Zhang, Qi Z
> > > <qi.z.zhang@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>
> > > Cc: ferruh.yigit@xilinx.com; dev@dpdk.org; Li, Xiaoyun
> > > <xiaoyun.li@intel.com>; awogbemila@google.com; Richardson,
> Bruce
> > > <bruce.richardson@intel.com>; Lin, Xueqin <xueqin.lin@intel.com>
> > > Subject: Re: [PATCH v4 7/9] net/gve: add support for Rx/Tx
> > >
> > > On 9/27/2022 8:32 AM, Junfeng Guo wrote:
> > >
> > > >
> > > > Add Rx/Tx of GQI_QPL queue format and GQI_RDA queue format.
> > > >
> > > > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> > > > Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
> > >
> > > <...>
> > >
> > > > --- a/drivers/net/gve/gve_ethdev.c
> > > > +++ b/drivers/net/gve/gve_ethdev.c
> > > > @@ -583,6 +583,11 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
> > > >          if (err)
> > > >                  return err;
> > > >
> > > > +       if (gve_is_gqi(priv)) {
> > > > +               eth_dev->rx_pkt_burst = gve_rx_burst;
> > > > +               eth_dev->tx_pkt_burst = gve_tx_burst;
> > > > +       }
> > > > +
> > >
> > > What do you think to add a log here for 'else' case, to inform user
> > > why datapath is not working?
> >
> > Agreed, make sense!
> > Currently only one queue mode (i.e., qpl mode) is supported on the GCP
> env.
> > Will add a log to inform this in the else case. Thanks!
> 
> This explanation is not correct. Only QPL mode is supported in GCP now.
> This is env limitation but not related to the else code here.
> gve_is_gqi() includes two modes GQI_QPL and GQI_RDA. And both of
> these datapath is supported in rxtx.
> GQI means its queue model is single queue model (txq for tx and rxq for
> rx). And there're 2 ways for this queue model QPL and RDA.
> QPL needs to copy packets from/to several reserved pages negotiated
> with backend. RDA is just like normal device and uses PA in descs.
> 
> The datapath not supported is DQO_RDA which uses different hardware
> so different queue model (split/double queue model). Tx will use txq and
> tx_completion_q and Rx will use rxq and rx_completion_q.
> This is not implemented in the datapath for now and will be implemented
> in the future.
> 
> So if you want to add comment here. Please say "DQO_RDA is not
> implemented and will be added in the future". Don't say it's not available
> in GCP env which is not the reason.

Okay, will add this in the coming version. Thanks!

> 
> >
> > >
> > > <...>
> > >
> > > > +uint16_t
> > > > +gve_rx_burst(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t
> > > nb_pkts)
> > > > +{
> > > > +       volatile struct gve_rx_desc *rxr, *rxd;
> > > > +       struct gve_rx_queue *rxq = rx_queue;
> > > > +       uint16_t rx_id = rxq->rx_tail;
> > > > +       struct rte_mbuf *rxe;
> > > > +       uint16_t nb_rx, len;
> > > > +       uint64_t addr;
> > > > +
> > > > +       rxr = rxq->rx_desc_ring;
> > > > +
> > > > +       for (nb_rx = 0; nb_rx < nb_pkts; nb_rx++) {
> > > > +               rxd = &rxr[rx_id];
> > > > +               if (GVE_SEQNO(rxd->flags_seq) != rxq->expected_seqno)
> > > > +                       break;
> > > > +
> > > > +               if (rxd->flags_seq & GVE_RXF_ERR)
> > > > +                       continue;
> > > > +
> > > > +               len = rte_be_to_cpu_16(rxd->len) - GVE_RX_PAD;
> > > > +               rxe = rxq->sw_ring[rx_id];
> > > > +               rxe->data_off = RTE_PKTMBUF_HEADROOM;
> > > > +               if (rxq->is_gqi_qpl) {
> > > > +                       addr = (uint64_t)(rxq->qpl->mz->addr) +
> > > > + rx_id * PAGE_SIZE
> > > + GVE_RX_PAD;
> > > > +                       rte_memcpy((void *)((size_t)rxe->buf_addr +
> > > > + rxe-
> > > >data_off),
> > > > +                                  (void *)(size_t)addr, len);
> > >
> > > Why a 'memcpy' is needed? Can't it DMA to mbuf data buffer?
> 
> When queue model is gpi_qpl (this is negotiated and gotten using adminq
> with backend), the device needs to register a block of memory (called
> page list). And tx needs to copy the packets to this memory and rx will get
> packets from this area.
> Backend will be responsible for getting(tx)/giving(rx) packets from this
> memory to the device/line (We don't really know how backend does this).
> Please refer to
> https://www.kernel.org/doc/html/v5.4/networking/device_drivers/googl
> e/gve.html. There's a bit more explanation about this queue format.
> 
> >
> > Well, only qpl (queue page list) mode supported on the GCP env now.
> > So the DMA may not be used in current case.
> 
> And yes, it's because GCP doesn't support GQI_RDA for now so GQI_QPL
> has to be implemented. But even if GCP env supports RDA in the future,
> unless they completely remove QPL support, QPL is still needed.
> Because queue format/model is getting from backend through
> gve_adminq_describe_device(). You may just get the QPI version. The
> device can't really control which queue format to get.

Thanks for the explanation!

> 
> >
> > >
> > > > +               }
> > > > +               rxe->nb_segs = 1;
> > > > +               rxe->next = NULL;
> > > > +               rxe->pkt_len = len;
> > > > +               rxe->data_len = len;
> > > > +               rxe->port = rxq->port_id;
> > > > +               rxe->packet_type = 0;
> > > > +               rxe->ol_flags = 0;
> > > > +
> > >
> > > As far as I can see 'sw_ring[]' filled using 'rte_pktmbuf_alloc_bulk()'
> > > API, which should reset mbuf fields to default values, so some of the
> > > assignment above can be redundant.
> >
> > Yes, some fields are already assigned at 'rte_pktmbuf_reset()'.
> > Will remove the redundant ones in the coming version. Thanks!
> >
> > >
> > > > +               if (rxd->flags_seq & GVE_RXF_TCP)
> > > > +                       rxe->packet_type |= RTE_PTYPE_L4_TCP;
> > > > +               if (rxd->flags_seq & GVE_RXF_UDP)
> > > > +                       rxe->packet_type |= RTE_PTYPE_L4_UDP;
> > > > +               if (rxd->flags_seq & GVE_RXF_IPV4)
> > > > +                       rxe->packet_type |= RTE_PTYPE_L3_IPV4;
> > > > +               if (rxd->flags_seq & GVE_RXF_IPV6)
> > > > +                       rxe->packet_type |= RTE_PTYPE_L3_IPV6;
> > > > +
> > >
> > > If you are setting packet_type, it is better to implement the
> > > 'dev_supported_ptypes_get()' dev_ops too, to announce to the host which
> > > packet types the parsing supports. (+ dev_ptypes_set() dev_ops) And later
> > > the driver can announce the "Packet type parsing" feature in the .ini file.
> >
> > Well, on current GCP env, the APIs for supported ptypes get/set have
> not
> > been exposed even in the base code. The only one in the base code is
> for
> > the dqo mode (gve_adminq_get_ptype_map_dqo). But this also cannot
> be
> > used on current GCP env. We can only implement this once they are
> > supported and exposed at GCP. Thanks!
> 
> You're mixing the concept again. GCP env only supports QPL is not an
> excuse.
> The packet type is supported even in QPL. It's just very limited to
> L4_TCP/UDP and L3_IPV4/6. Ptypes_get is possible and it'll be
> RTE_PTYPE_L3_IPV4/6 and RTE_PTYPE_L4_UDP/TCP.
> For DQO mode you mentioned, it'll be more flexible and have more
> support. I'm not sure what's your plan but it can be implemented
> whenever based on the plan not GCP env availability. The base code is
> there. It's just you may not be able to timely verify and debug it.
> 
> Ptype_set is not supported since the hardware doesn't support it (There's
> no such adminq).

Okay... not much bandwidth to implement this at this point.
Maybe next release, thanks!

Patch

diff --git a/doc/guides/nics/features/gve.ini b/doc/guides/nics/features/gve.ini
index fbff0a5462..38dc7024d6 100644
--- a/doc/guides/nics/features/gve.ini
+++ b/doc/guides/nics/features/gve.ini
@@ -7,6 +7,8 @@ 
 Speed capabilities   = Y
 Link status          = Y
 MTU update           = Y
+TSO                  = Y
+L4 checksum offload  = Y
 Linux                = Y
 x86-32               = Y
 x86-64               = Y
diff --git a/drivers/net/gve/gve_ethdev.c b/drivers/net/gve/gve_ethdev.c
index 7a3695aec1..0aae447b9b 100644
--- a/drivers/net/gve/gve_ethdev.c
+++ b/drivers/net/gve/gve_ethdev.c
@@ -583,6 +583,11 @@  gve_dev_init(struct rte_eth_dev *eth_dev)
 	if (err)
 		return err;
 
+	if (gve_is_gqi(priv)) {
+		eth_dev->rx_pkt_burst = gve_rx_burst;
+		eth_dev->tx_pkt_burst = gve_tx_burst;
+	}
+
 	eth_dev->data->mac_addrs = rte_zmalloc("gve_mac", sizeof(struct rte_ether_addr), 0);
 	if (!eth_dev->data->mac_addrs) {
 		PMD_DRV_LOG(ERR, "Failed to allocate memory to store mac address");
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index b0391f7df5..502ba88dc3 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -34,6 +34,18 @@  union gve_tx_desc {
 	struct gve_tx_seg_desc seg; /* subsequent descs for a packet */
 };
 
+/* Offload features */
+union gve_tx_offload {
+	uint64_t data;
+	struct {
+		uint64_t l2_len:7; /* L2 (MAC) Header Length. */
+		uint64_t l3_len:9; /* L3 (IP) Header Length. */
+		uint64_t l4_len:8; /* L4 Header Length. */
+		uint64_t tso_segsz:16; /* TCP TSO segment size */
+		/* uint64_t unused : 24; */
+	};
+};
+
 struct gve_tx_iovec {
 	uint32_t iov_base; /* offset in fifo */
 	uint32_t iov_len;
@@ -270,4 +282,8 @@  void gve_stop_tx_queues(struct rte_eth_dev *dev);
 
 void gve_stop_rx_queues(struct rte_eth_dev *dev);
 
+uint16_t gve_rx_burst(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
+
+uint16_t gve_tx_burst(void *txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts);
+
 #endif /* _GVE_ETHDEV_H_ */
diff --git a/drivers/net/gve/gve_rx.c b/drivers/net/gve/gve_rx.c
index e64a461253..3634a2762f 100644
--- a/drivers/net/gve/gve_rx.c
+++ b/drivers/net/gve/gve_rx.c
@@ -5,6 +5,149 @@ 
 #include "gve_ethdev.h"
 #include "base/gve_adminq.h"
 
+static inline void
+gve_rx_refill(struct gve_rx_queue *rxq)
+{
+	uint16_t mask = rxq->nb_rx_desc - 1;
+	uint16_t idx = rxq->next_avail & mask;
+	uint32_t next_avail = rxq->next_avail;
+	uint16_t nb_alloc, i;
+	struct rte_mbuf *nmb;
+	int diag;
+
+	/* wrap around */
+	nb_alloc = rxq->nb_rx_desc - idx;
+	if (nb_alloc <= rxq->nb_avail) {
+		diag = rte_pktmbuf_alloc_bulk(rxq->mpool, &rxq->sw_ring[idx], nb_alloc);
+		if (diag < 0) {
+			for (i = 0; i < nb_alloc; i++) {
+				nmb = rte_pktmbuf_alloc(rxq->mpool);
+				if (!nmb)
+					break;
+				rxq->sw_ring[idx + i] = nmb;
+			}
+			if (i != nb_alloc)
+				nb_alloc = i;
+		}
+		rxq->nb_avail -= nb_alloc;
+		next_avail += nb_alloc;
+
+		/* queue page list mode doesn't need real refill. */
+		if (rxq->is_gqi_qpl) {
+			idx += nb_alloc;
+		} else {
+			for (i = 0; i < nb_alloc; i++) {
+				nmb = rxq->sw_ring[idx];
+				rxq->rx_data_ring[idx].addr =
+					rte_cpu_to_be_64(rte_mbuf_data_iova(nmb));
+				idx++;
+			}
+		}
+		if (idx == rxq->nb_rx_desc)
+			idx = 0;
+	}
+
+	if (rxq->nb_avail > 0) {
+		nb_alloc = rxq->nb_avail;
+		if (rxq->nb_rx_desc < idx + rxq->nb_avail)
+			nb_alloc = rxq->nb_rx_desc - idx;
+		diag = rte_pktmbuf_alloc_bulk(rxq->mpool, &rxq->sw_ring[idx], nb_alloc);
+		if (diag < 0) {
+			for (i = 0; i < nb_alloc; i++) {
+				nmb = rte_pktmbuf_alloc(rxq->mpool);
+				if (!nmb)
+					break;
+				rxq->sw_ring[idx + i] = nmb;
+			}
+			nb_alloc = i;
+		}
+		rxq->nb_avail -= nb_alloc;
+		next_avail += nb_alloc;
+
+		if (!rxq->is_gqi_qpl) {
+			for (i = 0; i < nb_alloc; i++) {
+				nmb = rxq->sw_ring[idx];
+				rxq->rx_data_ring[idx].addr =
+					rte_cpu_to_be_64(rte_mbuf_data_iova(nmb));
+				idx++;
+			}
+		}
+	}
+
+	if (next_avail != rxq->next_avail) {
+		rte_write32(rte_cpu_to_be_32(next_avail), rxq->qrx_tail);
+		rxq->next_avail = next_avail;
+	}
+}
+
+uint16_t
+gve_rx_burst(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	volatile struct gve_rx_desc *rxr, *rxd;
+	struct gve_rx_queue *rxq = rx_queue;
+	uint16_t rx_id = rxq->rx_tail;
+	struct rte_mbuf *rxe;
+	uint16_t nb_rx, len;
+	uint64_t addr;
+
+	rxr = rxq->rx_desc_ring;
+
+	for (nb_rx = 0; nb_rx < nb_pkts; nb_rx++) {
+		rxd = &rxr[rx_id];
+		if (GVE_SEQNO(rxd->flags_seq) != rxq->expected_seqno)
+			break;
+
+		if (rxd->flags_seq & GVE_RXF_ERR)
+			continue;
+
+		len = rte_be_to_cpu_16(rxd->len) - GVE_RX_PAD;
+		rxe = rxq->sw_ring[rx_id];
+		rxe->data_off = RTE_PKTMBUF_HEADROOM;
+		if (rxq->is_gqi_qpl) {
+			addr = (uint64_t)(rxq->qpl->mz->addr) + rx_id * PAGE_SIZE + GVE_RX_PAD;
+			rte_memcpy((void *)((size_t)rxe->buf_addr + rxe->data_off),
+				   (void *)(size_t)addr, len);
+		}
+		rxe->nb_segs = 1;
+		rxe->next = NULL;
+		rxe->pkt_len = len;
+		rxe->data_len = len;
+		rxe->port = rxq->port_id;
+		rxe->packet_type = 0;
+		rxe->ol_flags = 0;
+
+		if (rxd->flags_seq & GVE_RXF_TCP)
+			rxe->packet_type |= RTE_PTYPE_L4_TCP;
+		if (rxd->flags_seq & GVE_RXF_UDP)
+			rxe->packet_type |= RTE_PTYPE_L4_UDP;
+		if (rxd->flags_seq & GVE_RXF_IPV4)
+			rxe->packet_type |= RTE_PTYPE_L3_IPV4;
+		if (rxd->flags_seq & GVE_RXF_IPV6)
+			rxe->packet_type |= RTE_PTYPE_L3_IPV6;
+
+		if (gve_needs_rss(rxd->flags_seq)) {
+			rxe->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;
+			rxe->hash.rss = rte_be_to_cpu_32(rxd->rss_hash);
+		}
+
+		rxq->expected_seqno = gve_next_seqno(rxq->expected_seqno);
+
+		rx_id++;
+		if (rx_id == rxq->nb_rx_desc)
+			rx_id = 0;
+
+		rx_pkts[nb_rx] = rxe;
+	}
+
+	rxq->nb_avail += nb_rx;
+	rxq->rx_tail = rx_id;
+
+	if (rxq->nb_avail > rxq->free_thresh)
+		gve_rx_refill(rxq);
+
+	return nb_rx;
+}
+
 static inline void
 gve_reset_rxq(struct gve_rx_queue *rxq)
 {
diff --git a/drivers/net/gve/gve_tx.c b/drivers/net/gve/gve_tx.c
index b706b62e71..d94b1186a4 100644
--- a/drivers/net/gve/gve_tx.c
+++ b/drivers/net/gve/gve_tx.c
@@ -5,6 +5,461 @@ 
 #include "gve_ethdev.h"
 #include "base/gve_adminq.h"
 
+static inline void
+gve_free_bulk_mbuf(struct rte_mbuf **txep, int num)
+{
+	struct rte_mbuf *m, *free[GVE_TX_MAX_FREE_SZ];
+	int nb_free = 0;
+	int i, s;
+
+	if (unlikely(num == 0))
+		return;
+
+	/* Find the 1st mbuf which needs to be free */
+	for (s = 0; s < num; s++) {
+		if (txep[s] != NULL) {
+			m = rte_pktmbuf_prefree_seg(txep[s]);
+			if (m != NULL)
+				break;
+			}
+	}
+
+	if (s == num)
+		return;
+
+	free[0] = m;
+	nb_free = 1;
+	for (i = s + 1; i < num; i++) {
+		if (likely(txep[i] != NULL)) {
+			m = rte_pktmbuf_prefree_seg(txep[i]);
+			if (likely(m != NULL)) {
+				if (likely(m->pool == free[0]->pool)) {
+					free[nb_free++] = m;
+				} else {
+					rte_mempool_put_bulk(free[0]->pool, (void *)free, nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+			txep[i] = NULL;
+		}
+	}
+	rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+}
+
+static inline void
+gve_tx_clean(struct gve_tx_queue *txq)
+{
+	uint16_t mask = txq->nb_tx_desc - 1;
+	uint32_t start = txq->next_to_clean & mask;
+	uint32_t ntc, nb_clean, i;
+	struct gve_tx_iovec *iov;
+
+	ntc = rte_be_to_cpu_32(rte_read32(txq->qtx_head));
+	ntc = ntc & mask;
+
+	if (ntc == start)
+		return;
+
+	/* if wrap around, free twice. */
+	if (ntc < start) {
+		nb_clean = txq->nb_tx_desc - start;
+		if (nb_clean > GVE_TX_MAX_FREE_SZ)
+			nb_clean = GVE_TX_MAX_FREE_SZ;
+		if (txq->is_gqi_qpl) {
+			for (i = start; i < start + nb_clean; i++) {
+				iov = &txq->iov_ring[i];
+				txq->fifo_avail += iov->iov_len;
+				iov->iov_base = 0;
+				iov->iov_len = 0;
+			}
+		} else {
+			gve_free_bulk_mbuf(&txq->sw_ring[start], nb_clean);
+		}
+		txq->nb_free += nb_clean;
+		start += nb_clean;
+		if (start == txq->nb_tx_desc)
+			start = 0;
+		txq->next_to_clean += nb_clean;
+	}
+
+	if (ntc > start) {
+		nb_clean = ntc - start;
+		if (nb_clean > GVE_TX_MAX_FREE_SZ)
+			nb_clean = GVE_TX_MAX_FREE_SZ;
+		if (txq->is_gqi_qpl) {
+			for (i = start; i < start + nb_clean; i++) {
+				iov = &txq->iov_ring[i];
+				txq->fifo_avail += iov->iov_len;
+				iov->iov_base = 0;
+				iov->iov_len = 0;
+			}
+		} else {
+			gve_free_bulk_mbuf(&txq->sw_ring[start], nb_clean);
+		}
+		txq->nb_free += nb_clean;
+		txq->next_to_clean += nb_clean;
+	}
+}
+
+static inline void
+gve_tx_clean_swr_qpl(struct gve_tx_queue *txq)
+{
+	uint32_t start = txq->sw_ntc;
+	uint32_t ntc, nb_clean;
+
+	ntc = txq->sw_tail;
+
+	if (ntc == start)
+		return;
+
+	/* if wrap around, free twice. */
+	if (ntc < start) {
+		nb_clean = txq->nb_tx_desc - start;
+		if (nb_clean > GVE_TX_MAX_FREE_SZ)
+			nb_clean = GVE_TX_MAX_FREE_SZ;
+		gve_free_bulk_mbuf(&txq->sw_ring[start], nb_clean);
+
+		txq->sw_nb_free += nb_clean;
+		start += nb_clean;
+		if (start == txq->nb_tx_desc)
+			start = 0;
+		txq->sw_ntc = start;
+	}
+
+	if (ntc > start) {
+		nb_clean = ntc - start;
+		if (nb_clean > GVE_TX_MAX_FREE_SZ)
+			nb_clean = GVE_TX_MAX_FREE_SZ;
+		gve_free_bulk_mbuf(&txq->sw_ring[start], nb_clean);
+		txq->sw_nb_free += nb_clean;
+		start += nb_clean;
+		txq->sw_ntc = start;
+	}
+}
+
+static inline void
+gve_tx_fill_pkt_desc(volatile union gve_tx_desc *desc, struct rte_mbuf *mbuf,
+		     uint8_t desc_cnt, uint16_t len, uint64_t addr)
+{
+	uint64_t csum_l4 = mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK;
+	uint8_t l4_csum_offset = 0;
+	uint8_t l4_hdr_offset = 0;
+
+	if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)
+		csum_l4 |= RTE_MBUF_F_TX_TCP_CKSUM;
+
+	switch (csum_l4) {
+	case RTE_MBUF_F_TX_TCP_CKSUM:
+		l4_csum_offset = offsetof(struct rte_tcp_hdr, cksum);
+		l4_hdr_offset = mbuf->l2_len + mbuf->l3_len;
+		break;
+	case RTE_MBUF_F_TX_UDP_CKSUM:
+		l4_csum_offset = offsetof(struct rte_udp_hdr, dgram_cksum);
+		l4_hdr_offset = mbuf->l2_len + mbuf->l3_len;
+		break;
+	case RTE_MBUF_F_TX_SCTP_CKSUM:
+		l4_csum_offset = offsetof(struct rte_sctp_hdr, cksum);
+		l4_hdr_offset = mbuf->l2_len + mbuf->l3_len;
+		break;
+	}
+
+	if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
+		desc->pkt.type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
+		desc->pkt.l4_csum_offset = l4_csum_offset >> 1;
+		desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
+	} else if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
+		desc->pkt.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
+		desc->pkt.l4_csum_offset = l4_csum_offset >> 1;
+		desc->pkt.l4_hdr_offset = l4_hdr_offset >> 1;
+	} else {
+		desc->pkt.type_flags = GVE_TXD_STD;
+		desc->pkt.l4_csum_offset = 0;
+		desc->pkt.l4_hdr_offset = 0;
+	}
+	desc->pkt.desc_cnt = desc_cnt;
+	desc->pkt.len = rte_cpu_to_be_16(mbuf->pkt_len);
+	desc->pkt.seg_len = rte_cpu_to_be_16(len);
+	desc->pkt.seg_addr = rte_cpu_to_be_64(addr);
+}
+
+static inline void
+gve_tx_fill_seg_desc(volatile union gve_tx_desc *desc, uint64_t ol_flags,
+		      union gve_tx_offload tx_offload,
+		      uint16_t len, uint64_t addr)
+{
+	desc->seg.type_flags = GVE_TXD_SEG;
+	if (ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
+		if (ol_flags & RTE_MBUF_F_TX_IPV6)
+			desc->seg.type_flags |= GVE_TXSF_IPV6;
+		desc->seg.l3_offset = tx_offload.l2_len >> 1;
+		desc->seg.mss = rte_cpu_to_be_16(tx_offload.tso_segsz);
+	}
+	desc->seg.seg_len = rte_cpu_to_be_16(len);
+	desc->seg.seg_addr = rte_cpu_to_be_64(addr);
+}
+
+static inline bool
+is_fifo_avail(struct gve_tx_queue *txq, uint16_t len)
+{
+	if (txq->fifo_avail < len)
+		return false;
+	/* Don't split segment. */
+	if (txq->fifo_head + len > txq->fifo_size &&
+	    txq->fifo_size - txq->fifo_head + len > txq->fifo_avail)
+		return false;
+	return true;
+}
+static inline uint64_t
+gve_tx_alloc_from_fifo(struct gve_tx_queue *txq, uint16_t tx_id, uint16_t len)
+{
+	uint32_t head = txq->fifo_head;
+	uint32_t size = txq->fifo_size;
+	struct gve_tx_iovec *iov;
+	uint32_t aligned_head;
+	uint32_t iov_len = 0;
+	uint64_t fifo_addr;
+
+	iov = &txq->iov_ring[tx_id];
+
+	/* Don't split segment */
+	if (head + len > size) {
+		iov_len += (size - head);
+		head = 0;
+	}
+
+	fifo_addr = head;
+	iov_len += len;
+	iov->iov_base = head;
+
+	/* Re-align to a cacheline for next head */
+	head += len;
+	aligned_head = RTE_ALIGN(head, RTE_CACHE_LINE_SIZE);
+	iov_len += (aligned_head - head);
+	iov->iov_len = iov_len;
+
+	if (aligned_head == txq->fifo_size)
+		aligned_head = 0;
+	txq->fifo_head = aligned_head;
+	txq->fifo_avail -= iov_len;
+
+	return fifo_addr;
+}
+
+static inline uint16_t
+gve_tx_burst_qpl(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	union gve_tx_offload tx_offload = {0};
+	volatile union gve_tx_desc *txr, *txd;
+	struct gve_tx_queue *txq = tx_queue;
+	struct rte_mbuf **sw_ring = txq->sw_ring;
+	uint16_t mask = txq->nb_tx_desc - 1;
+	uint16_t tx_id = txq->tx_tail & mask;
+	uint64_t ol_flags, addr, fifo_addr;
+	uint32_t tx_tail = txq->tx_tail;
+	struct rte_mbuf *tx_pkt, *first;
+	uint16_t sw_id = txq->sw_tail;
+	uint16_t nb_used, i;
+	uint16_t nb_tx = 0;
+	uint32_t hlen;
+
+	txr = txq->tx_desc_ring;
+
+	if (txq->nb_free < txq->free_thresh || txq->fifo_avail == 0)
+		gve_tx_clean(txq);
+
+	if (txq->sw_nb_free < txq->free_thresh)
+		gve_tx_clean_swr_qpl(txq);
+
+	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
+		tx_pkt = *tx_pkts++;
+		ol_flags = tx_pkt->ol_flags;
+
+		if (txq->sw_nb_free < tx_pkt->nb_segs) {
+			gve_tx_clean_swr_qpl(txq);
+			if (txq->sw_nb_free < tx_pkt->nb_segs)
+				goto end_of_tx;
+		}
+
+		/* Even for multi-segs, use 1 qpl buf for data */
+		nb_used = 1;
+		if (ol_flags & RTE_MBUF_F_TX_TCP_SEG)
+			nb_used++;
+
+		if (txq->nb_free < nb_used)
+			goto end_of_tx;
+
+		tx_offload.l2_len = tx_pkt->l2_len;
+		tx_offload.l3_len = tx_pkt->l3_len;
+		tx_offload.l4_len = tx_pkt->l4_len;
+		tx_offload.tso_segsz = tx_pkt->tso_segsz;
+
+		first = tx_pkt;
+		txd = &txr[tx_id];
+		hlen = ol_flags & RTE_MBUF_F_TX_TCP_SEG ?
+			(uint32_t)(tx_offload.l2_len + tx_offload.l3_len + tx_offload.l4_len) :
+			tx_pkt->pkt_len;
+
+		sw_ring[sw_id] = tx_pkt;
+		if (!is_fifo_avail(txq, hlen)) {
+			gve_tx_clean(txq);
+			if (!is_fifo_avail(txq, hlen))
+				goto end_of_tx;
+		}
+		addr = (uint64_t)(tx_pkt->buf_addr) + tx_pkt->data_off;
+		fifo_addr = gve_tx_alloc_from_fifo(txq, tx_id, hlen);
+
+		/* For TSO, check if there's enough fifo space for data first */
+		if (ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
+			if (!is_fifo_avail(txq, tx_pkt->pkt_len - hlen)) {
+				gve_tx_clean(txq);
+				if (!is_fifo_avail(txq, tx_pkt->pkt_len - hlen))
+					goto end_of_tx;
+			}
+		}
+		if (tx_pkt->nb_segs == 1 || ol_flags & RTE_MBUF_F_TX_TCP_SEG)
+			rte_memcpy((void *)(size_t)(fifo_addr + txq->fifo_base),
+				   (void *)(size_t)addr, hlen);
+		else
+			rte_pktmbuf_read(tx_pkt, 0, hlen,
+					 (void *)(size_t)(fifo_addr + txq->fifo_base));
+		gve_tx_fill_pkt_desc(txd, tx_pkt, nb_used, hlen, fifo_addr);
+
+		if (ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
+			tx_id = (tx_id + 1) & mask;
+			txd = &txr[tx_id];
+			addr = (uint64_t)(tx_pkt->buf_addr) + tx_pkt->data_off + hlen;
+			fifo_addr = gve_tx_alloc_from_fifo(txq, tx_id, tx_pkt->pkt_len - hlen);
+			if (tx_pkt->nb_segs == 1)
+				rte_memcpy((void *)(size_t)(fifo_addr + txq->fifo_base),
+					   (void *)(size_t)addr,
+					   tx_pkt->pkt_len - hlen);
+			else
+				rte_pktmbuf_read(tx_pkt, hlen, tx_pkt->pkt_len - hlen,
+						 (void *)(size_t)(fifo_addr + txq->fifo_base));
+
+			gve_tx_fill_seg_desc(txd, ol_flags, tx_offload,
+					     tx_pkt->pkt_len - hlen, fifo_addr);
+		}
+
+		/* record mbuf in sw_ring for free */
+		for (i = 1; i < first->nb_segs; i++) {
+			sw_id = (sw_id + 1) & mask;
+			tx_pkt = tx_pkt->next;
+			sw_ring[sw_id] = tx_pkt;
+		}
+
+		sw_id = (sw_id + 1) & mask;
+		tx_id = (tx_id + 1) & mask;
+
+		txq->nb_free -= nb_used;
+		txq->sw_nb_free -= first->nb_segs;
+		tx_tail += nb_used;
+	}
+
+end_of_tx:
+	if (nb_tx) {
+		rte_write32(rte_cpu_to_be_32(tx_tail), txq->qtx_tail);
+		txq->tx_tail = tx_tail;
+		txq->sw_tail = sw_id;
+	}
+
+	return nb_tx;
+}
+
+static inline uint16_t
+gve_tx_burst_ra(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	union gve_tx_offload tx_offload = {0};
+	volatile union gve_tx_desc *txr, *txd;
+	struct gve_tx_queue *txq = tx_queue;
+	struct rte_mbuf **sw_ring = txq->sw_ring;
+	uint16_t mask = txq->nb_tx_desc - 1;
+	uint16_t tx_id = txq->tx_tail & mask;
+	uint32_t tx_tail = txq->tx_tail;
+	struct rte_mbuf *tx_pkt, *first;
+	uint16_t nb_used, hlen, i;
+	uint64_t ol_flags, addr;
+	uint16_t nb_tx = 0;
+
+	txr = txq->tx_desc_ring;
+
+	if (txq->nb_free < txq->free_thresh)
+		gve_tx_clean(txq);
+
+	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
+		tx_pkt = *tx_pkts++;
+		ol_flags = tx_pkt->ol_flags;
+
+		nb_used = tx_pkt->nb_segs;
+		if (ol_flags & RTE_MBUF_F_TX_TCP_SEG)
+			nb_used++;
+
+		if (txq->nb_free < nb_used)
+			goto end_of_tx;
+
+		tx_offload.l2_len = tx_pkt->l2_len;
+		tx_offload.l3_len = tx_pkt->l3_len;
+		tx_offload.l4_len = tx_pkt->l4_len;
+		tx_offload.tso_segsz = tx_pkt->tso_segsz;
+
+		first = tx_pkt;
+		txd = &txr[tx_id];
+
+		hlen = ol_flags & RTE_MBUF_F_TX_TCP_SEG ?
+			(uint32_t)(tx_offload.l2_len + tx_offload.l3_len + tx_offload.l4_len) :
+			tx_pkt->pkt_len;
+		/*
+		 * if tso, the driver needs to fill 2 descs for 1 mbuf
+		 * so only put this mbuf into the 1st tx entry in sw ring
+		 */
+		sw_ring[tx_id] = tx_pkt;
+		addr = rte_mbuf_data_iova(tx_pkt);
+		gve_tx_fill_pkt_desc(txd, tx_pkt, nb_used, hlen, addr);
+
+		if (ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
+			tx_id = (tx_id + 1) & mask;
+			txd = &txr[tx_id];
+			addr = rte_mbuf_data_iova(tx_pkt) + hlen;
+			gve_tx_fill_seg_desc(txd, ol_flags, tx_offload,
+					     tx_pkt->data_len - hlen, addr);
+		}
+
+		for (i = 1; i < first->nb_segs; i++) {
+			tx_id = (tx_id + 1) & mask;
+			txd = &txr[tx_id];
+			tx_pkt = tx_pkt->next;
+			sw_ring[tx_id] = tx_pkt;
+			addr = rte_mbuf_data_iova(tx_pkt);
+			gve_tx_fill_seg_desc(txd, ol_flags, tx_offload,
+					     tx_pkt->data_len, addr);
+		}
+		tx_id = (tx_id + 1) & mask;
+
+		txq->nb_free -= nb_used;
+		tx_tail += nb_used;
+	}
+
+end_of_tx:
+	if (nb_tx) {
+		rte_write32(rte_cpu_to_be_32(tx_tail), txq->qtx_tail);
+		txq->tx_tail = tx_tail;
+	}
+
+	return nb_tx;
+}
+
+uint16_t
+gve_tx_burst(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct gve_tx_queue *txq = tx_queue;
+
+	if (txq->is_gqi_qpl)
+		return gve_tx_burst_qpl(tx_queue, tx_pkts, nb_pkts);
+
+	return gve_tx_burst_ra(tx_queue, tx_pkts, nb_pkts);
+}
+
 static inline void
 gve_reset_txq(struct gve_tx_queue *txq)
 {