[v4,2/2] net/tap: support TSO (TCP Segment Offload)
Checks
Commit Message
This commit implements TCP segmentation offload in TAP.
librte_gso library is used to segment large TCP payloads (e.g. packets
of 64K bytes size) into smaller MTU size buffers.
By supporting TSO offload capability in software a TAP device can be used
as a failsafe sub device and be paired with another PCI device which
supports TSO capability in HW.
For more details on librte_gso implementation please refer to dpdk
documentation.
The number of newly generated TCP TSO segments is limited to 64.
Reviewed-by: Raslan Darawsheh <rasland@mellanox.com>
Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
drivers/net/tap/Makefile | 2 +-
drivers/net/tap/rte_eth_tap.c | 159 +++++++++++++++++++++++++++++++++++-------
drivers/net/tap/rte_eth_tap.h | 3 +
mk/rte.app.mk | 4 +-
4 files changed, 138 insertions(+), 30 deletions(-)
Comments
> On Jun 12, 2018, at 11:31 AM, Ophir Munk <ophirmu@mellanox.com> wrote:
>
> This commit implements TCP segmentation offload in TAP.
> librte_gso library is used to segment large TCP payloads (e.g. packets
> of 64K bytes size) into smaller MTU size buffers.
> By supporting TSO offload capability in software a TAP device can be used
> as a failsafe sub device and be paired with another PCI device which
> supports TSO capability in HW.
>
> For more details on librte_gso implementation please refer to dpdk
> documentation.
> The number of newly generated TCP TSO segments is limited to 64.
>
> Reviewed-by: Raslan Darawsheh <rasland@mellanox.com>
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> ---
> drivers/net/tap/Makefile | 2 +-
> drivers/net/tap/rte_eth_tap.c | 159 +++++++++++++++++++++++++++++++++++-------
> drivers/net/tap/rte_eth_tap.h | 3 +
> mk/rte.app.mk | 4 +-
> 4 files changed, 138 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
> index ccc5c5f..3243365 100644
> --- a/drivers/net/tap/Makefile
> +++ b/drivers/net/tap/Makefile
> @@ -24,7 +24,7 @@ CFLAGS += -I.
> CFLAGS += $(WERROR_FLAGS)
> LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
> LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
> -LDLIBS += -lrte_bus_vdev
> +LDLIBS += -lrte_bus_vdev -lrte_gso
>
> CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
>
> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
> index c19f053..62b931f 100644
> --- a/drivers/net/tap/rte_eth_tap.c
> +++ b/drivers/net/tap/rte_eth_tap.c
> @@ -17,6 +17,7 @@
> #include <rte_ip.h>
> #include <rte_string_fns.h>
>
> +#include <assert.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <sys/socket.h>
> @@ -55,6 +56,9 @@
> #define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
> #define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
>
> +#define TAP_GSO_MBUFS_NUM 128
> +#define TAP_GSO_MBUF_SEG_SIZE 128
> +
> static struct rte_vdev_driver pmd_tap_drv;
> static struct rte_vdev_driver pmd_tun_drv;
>
> @@ -412,7 +416,8 @@ tap_tx_offload_get_queue_capa(void)
> return DEV_TX_OFFLOAD_MULTI_SEGS |
> DEV_TX_OFFLOAD_IPV4_CKSUM |
> DEV_TX_OFFLOAD_UDP_CKSUM |
> - DEV_TX_OFFLOAD_TCP_CKSUM;
> + DEV_TX_OFFLOAD_TCP_CKSUM |
> + DEV_TX_OFFLOAD_TCP_TSO;
> }
>
> /* Finalize l4 checksum calculation */
> @@ -479,23 +484,15 @@ tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
> }
> }
>
> -/* Callback to handle sending packets from the tap interface
> - */
> -static uint16_t
> -pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> +static inline void
> +tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
> + struct rte_mbuf **pmbufs, uint16_t l234_hlen,
> + uint16_t *num_packets, unsigned long *num_tx_bytes)
> {
> - struct tx_queue *txq = queue;
> - uint16_t num_tx = 0;
> - unsigned long num_tx_bytes = 0;
> - uint32_t max_size;
> int i;
>
> - if (unlikely(nb_pkts == 0))
> - return 0;
> -
> - max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> - for (i = 0; i < nb_pkts; i++) {
> - struct rte_mbuf *mbuf = bufs[num_tx];
> + for (i = 0; i < num_mbufs; i++) {
> + struct rte_mbuf *mbuf = pmbufs[i];
> struct iovec iovecs[mbuf->nb_segs + 2];
> struct tun_pi pi = { .flags = 0, .proto = 0x00 };
> struct rte_mbuf *seg = mbuf;
> @@ -503,8 +500,7 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> int proto;
> int n;
> int j;
> - int k; /* first index in iovecs for copying segments */
> - uint16_t l234_hlen; /* length of layers 2,3,4 headers */
> + int k; /* current index in iovecs for copying segments */
> uint16_t seg_len; /* length of first segment */
> uint16_t nb_segs;
> uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
> @@ -512,10 +508,6 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
> uint16_t is_cksum = 0; /* in case cksum should be offloaded */
>
> - /* stats.errs will be incremented */
> - if (rte_pktmbuf_pkt_len(mbuf) > max_size)
> - break;
> -
> l4_cksum = NULL;
> if (txq->type == ETH_TUNTAP_TYPE_TUN) {
> /*
> @@ -554,9 +546,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
> if (seg_len < l234_hlen)
> break;
> -
> - /* To change checksums, work on a
> - * copy of l2, l3 l4 headers.
> + /* To change checksums, work on a * copy of l2, l3
> + * headers + l4 pseudo header
> */
> rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
> l234_hlen);
> @@ -598,13 +589,80 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> n = writev(txq->fd, iovecs, j);
> if (n <= 0)
> break;
> + (*num_packets)++;
> + (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
> + }
> +}
> +
> +/* Callback to handle sending packets from the tap interface
> + */
> +static uint16_t
> +pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> +{
> + struct tx_queue *txq = queue;
> + uint16_t num_tx = 0;
> + uint16_t num_packets = 0;
> + unsigned long num_tx_bytes = 0;
> + uint16_t tso_segsz = 0;
> + uint16_t hdrs_len;
> + uint32_t max_size;
> + int i;
> + uint64_t tso;
> + int ret;
>
> + if (unlikely(nb_pkts == 0))
> + return 0;
> +
> + struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
> + max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> + for (i = 0; i < nb_pkts; i++) {
> + struct rte_mbuf *mbuf_in = bufs[num_tx];
> + struct rte_mbuf **mbuf;
> + uint16_t num_mbufs;
> +
> + tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
> + if (tso) {
> + struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
Blank line here after declare.
> + assert(gso_ctx != NULL);
> + /* TCP segmentation implies TCP checksum offload */
> + mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;
> + /* gso size is calculated without ETHER_CRC_LEN */
> + hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
> + mbuf_in->l4_len;
> + tso_segsz = mbuf_in->tso_segsz + hdrs_len;
> + if (unlikely(tso_segsz == hdrs_len) ||
> + tso_segsz > *txq->mtu) {
> + txq->stats.errs++;
> + break;
> + }
> + gso_ctx->gso_size = tso_segsz;
> + ret = rte_gso_segment(mbuf_in, /* packet to segment */
> + gso_ctx, /* gso control block */
> + (struct rte_mbuf **)&gso_mbufs, /* out mbufs */
> + RTE_DIM(gso_mbufs)); /* max tso mbufs */
> +
> + /* ret contains the number of new created mbufs */
> + if (ret < 0)
> + break;
> +
> + mbuf = gso_mbufs;
> + num_mbufs = ret;
> + } else {
> + /* stats.errs will be incremented */
> + if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
> + break;
> +
> + mbuf = &mbuf_in;
> + num_mbufs = 1;
> + }
> +
> + tap_write_mbufs(txq, num_mbufs, mbuf, hdrs_len,
> + &num_packets, &num_tx_bytes);
> num_tx++;
> - num_tx_bytes += mbuf->pkt_len;
> - rte_pktmbuf_free(mbuf);
> + rte_pktmbuf_free(mbuf_in);
> }
>
> - txq->stats.opackets += num_tx;
> + txq->stats.opackets += num_packets;
> txq->stats.errs += nb_pkts - num_tx;
> txq->stats.obytes += num_tx_bytes;
>
> @@ -1066,31 +1124,73 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
> }
>
> static int
> +tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
> +{
> + uint32_t gso_types;
> + char pool_name[64];
> +
> + /*
> + * Create private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes
> + * size per mbuf use this pool for both direct and indirect mbufs
> + */
> +
> + struct rte_mempool *mp; /* Mempool for GSO packets */
Blank line after declare.
> + /* initialize GSO context */
> + gso_types = DEV_TX_OFFLOAD_TCP_TSO;
> + snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
> + mp = rte_mempool_lookup((const char *)pool_name);
> + if (!mp) {
> + mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
> + 0, 0, RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
> + SOCKET_ID_ANY);
> + if (!mp) {
> + struct pmd_internals *pmd = dev->data->dev_private;
> + RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device %s\n",
> + pmd->name, dev->device->name);
> + return -1;
> + }
> + }
> +
> + gso_ctx->direct_pool = mp;
> + gso_ctx->indirect_pool = mp;
> + gso_ctx->gso_types = gso_types;
> + gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
> + gso_ctx->flag = 0;
> +
> + return 0;
> +}
> +
> +static int
> tap_setup_queue(struct rte_eth_dev *dev,
> struct pmd_internals *internals,
> uint16_t qid,
> int is_rx)
> {
> + int ret;
> int *fd;
> int *other_fd;
> const char *dir;
> struct pmd_internals *pmd = dev->data->dev_private;
> struct rx_queue *rx = &internals->rxq[qid];
> struct tx_queue *tx = &internals->txq[qid];
> + struct rte_gso_ctx *gso_ctx;
>
> if (is_rx) {
> fd = &rx->fd;
> other_fd = &tx->fd;
> dir = "rx";
> + gso_ctx = NULL;
> } else {
> fd = &tx->fd;
> other_fd = &rx->fd;
> dir = "tx";
> + gso_ctx = &tx->gso_ctx;
> }
> if (*fd != -1) {
> /* fd for this queue already exists */
> TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
> pmd->name, *fd, dir, qid);
> + gso_ctx = NULL;
> } else if (*other_fd != -1) {
> /* Only other_fd exists. dup it */
> *fd = dup(*other_fd);
> @@ -1115,6 +1215,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
>
> tx->mtu = &dev->data->mtu;
> rx->rxmode = &dev->data->dev_conf.rxmode;
> + if (gso_ctx) {
> + ret = tap_gso_ctx_setup(gso_ctx, dev);
> + if (ret)
> + return -1;
> + }
>
> tx->type = pmd->type;
>
> diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
> index 7b21d0d..44e2773 100644
> --- a/drivers/net/tap/rte_eth_tap.h
> +++ b/drivers/net/tap/rte_eth_tap.h
> @@ -15,6 +15,7 @@
>
> #include <rte_ethdev_driver.h>
> #include <rte_ether.h>
> +#include <rte_gso.h>
> #include "tap_log.h"
>
> #ifdef IFF_MULTI_QUEUE
> @@ -22,6 +23,7 @@
> #else
> #define RTE_PMD_TAP_MAX_QUEUES 1
> #endif
> +#define MAX_GSO_MBUFS 64
>
> enum rte_tuntap_type {
> ETH_TUNTAP_TYPE_UNKNOWN,
> @@ -59,6 +61,7 @@ struct tx_queue {
> uint16_t *mtu; /* Pointer to MTU from dev_data */
> uint16_t csum:1; /* Enable checksum offloading */
> struct pkt_stats stats; /* Stats for this TX queue */
> + struct rte_gso_ctx gso_ctx; /* GSO context */
> };
>
> struct pmd_internals {
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 1e32c83..e2ee879 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -38,8 +38,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT) += -lrte_port
> _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump
> _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor
> _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag
> -_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
> -_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
> _LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter
> _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm
> # librte_acl needs --whole-archive because of weak functions
> @@ -61,6 +59,8 @@ endif
> _LDLIBS-y += --whole-archive
>
> _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE) += -lrte_cfgfile
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
> _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH) += -lrte_hash
> _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER) += -lrte_member
> _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST) += -lrte_vhost
> --
> 2.7.4
>
Regards,
Keith
> On Jun 12, 2018, at 11:31 AM, Ophir Munk <ophirmu@mellanox.com> wrote:
>
> This commit implements TCP segmentation offload in TAP.
> librte_gso library is used to segment large TCP payloads (e.g. packets
> of 64K bytes size) into smaller MTU size buffers.
> By supporting TSO offload capability in software a TAP device can be used
> as a failsafe sub device and be paired with another PCI device which
> supports TSO capability in HW.
>
> For more details on librte_gso implementation please refer to dpdk
> documentation.
> The number of newly generated TCP TSO segments is limited to 64.
>
> Reviewed-by: Raslan Darawsheh <rasland@mellanox.com>
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> ---
> drivers/net/tap/Makefile | 2 +-
> drivers/net/tap/rte_eth_tap.c | 159 +++++++++++++++++++++++++++++++++++-------
> drivers/net/tap/rte_eth_tap.h | 3 +
> mk/rte.app.mk | 4 +-
> 4 files changed, 138 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
> index ccc5c5f..3243365 100644
> --- a/drivers/net/tap/Makefile
> +++ b/drivers/net/tap/Makefile
> @@ -24,7 +24,7 @@ CFLAGS += -I.
> CFLAGS += $(WERROR_FLAGS)
> LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
> LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
> -LDLIBS += -lrte_bus_vdev
> +LDLIBS += -lrte_bus_vdev -lrte_gso
>
> CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
>
> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
> index c19f053..62b931f 100644
> --- a/drivers/net/tap/rte_eth_tap.c
> +++ b/drivers/net/tap/rte_eth_tap.c
> @@ -17,6 +17,7 @@
> #include <rte_ip.h>
> #include <rte_string_fns.h>
>
> +#include <assert.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <sys/socket.h>
> @@ -55,6 +56,9 @@
> #define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
> #define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
>
> +#define TAP_GSO_MBUFS_NUM 128
> +#define TAP_GSO_MBUF_SEG_SIZE 128
> +
> static struct rte_vdev_driver pmd_tap_drv;
> static struct rte_vdev_driver pmd_tun_drv;
>
> @@ -412,7 +416,8 @@ tap_tx_offload_get_queue_capa(void)
> return DEV_TX_OFFLOAD_MULTI_SEGS |
> DEV_TX_OFFLOAD_IPV4_CKSUM |
> DEV_TX_OFFLOAD_UDP_CKSUM |
> - DEV_TX_OFFLOAD_TCP_CKSUM;
> + DEV_TX_OFFLOAD_TCP_CKSUM |
> + DEV_TX_OFFLOAD_TCP_TSO;
> }
>
> /* Finalize l4 checksum calculation */
> @@ -479,23 +484,15 @@ tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
> }
> }
>
> -/* Callback to handle sending packets from the tap interface
> - */
> -static uint16_t
> -pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> +static inline void
> +tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
> + struct rte_mbuf **pmbufs, uint16_t l234_hlen,
> + uint16_t *num_packets, unsigned long *num_tx_bytes)
> {
> - struct tx_queue *txq = queue;
> - uint16_t num_tx = 0;
> - unsigned long num_tx_bytes = 0;
> - uint32_t max_size;
> int i;
>
> - if (unlikely(nb_pkts == 0))
> - return 0;
> -
> - max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> - for (i = 0; i < nb_pkts; i++) {
> - struct rte_mbuf *mbuf = bufs[num_tx];
> + for (i = 0; i < num_mbufs; i++) {
> + struct rte_mbuf *mbuf = pmbufs[i];
> struct iovec iovecs[mbuf->nb_segs + 2];
> struct tun_pi pi = { .flags = 0, .proto = 0x00 };
> struct rte_mbuf *seg = mbuf;
> @@ -503,8 +500,7 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> int proto;
> int n;
> int j;
> - int k; /* first index in iovecs for copying segments */
> - uint16_t l234_hlen; /* length of layers 2,3,4 headers */
> + int k; /* current index in iovecs for copying segments */
> uint16_t seg_len; /* length of first segment */
> uint16_t nb_segs;
> uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
> @@ -512,10 +508,6 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
> uint16_t is_cksum = 0; /* in case cksum should be offloaded */
>
> - /* stats.errs will be incremented */
> - if (rte_pktmbuf_pkt_len(mbuf) > max_size)
> - break;
> -
> l4_cksum = NULL;
> if (txq->type == ETH_TUNTAP_TYPE_TUN) {
> /*
> @@ -554,9 +546,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
> if (seg_len < l234_hlen)
> break;
> -
> - /* To change checksums, work on a
> - * copy of l2, l3 l4 headers.
Adding back in the blank line above would be nice for readability.
> + /* To change checksums, work on a * copy of l2, l3
> + * headers + l4 pseudo header
> */
> rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
> l234_hlen);
> @@ -598,13 +589,80 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> n = writev(txq->fd, iovecs, j);
> if (n <= 0)
> break;
> + (*num_packets)++;
> + (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
> + }
> +}
> +
> +/* Callback to handle sending packets from the tap interface
> + */
> +static uint16_t
> +pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> +{
> + struct tx_queue *txq = queue;
> + uint16_t num_tx = 0;
> + uint16_t num_packets = 0;
> + unsigned long num_tx_bytes = 0;
> + uint16_t tso_segsz = 0;
> + uint16_t hdrs_len;
> + uint32_t max_size;
> + int i;
> + uint64_t tso;
> + int ret;
>
> + if (unlikely(nb_pkts == 0))
> + return 0;
> +
> + struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
> + max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> + for (i = 0; i < nb_pkts; i++) {
> + struct rte_mbuf *mbuf_in = bufs[num_tx];
> + struct rte_mbuf **mbuf;
> + uint16_t num_mbufs;
> +
> + tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
> + if (tso) {
> + struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
Missing blank line here, this one is not optional.
> + assert(gso_ctx != NULL);
Blank line would be nice.
> + /* TCP segmentation implies TCP checksum offload */
> + mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;
Blank line would be nice.
> + /* gso size is calculated without ETHER_CRC_LEN */
> + hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
> + mbuf_in->l4_len;
> + tso_segsz = mbuf_in->tso_segsz + hdrs_len;
> + if (unlikely(tso_segsz == hdrs_len) ||
> + tso_segsz > *txq->mtu) {
> + txq->stats.errs++;
> + break;
> + }
> + gso_ctx->gso_size = tso_segsz;
> + ret = rte_gso_segment(mbuf_in, /* packet to segment */
> + gso_ctx, /* gso control block */
> + (struct rte_mbuf **)&gso_mbufs, /* out mbufs */
> + RTE_DIM(gso_mbufs)); /* max tso mbufs */
> +
> + /* ret contains the number of new created mbufs */
> + if (ret < 0)
> + break;
> +
> + mbuf = gso_mbufs;
> + num_mbufs = ret;
> + } else {
> + /* stats.errs will be incremented */
> + if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
> + break;
> +
> + mbuf = &mbuf_in;
> + num_mbufs = 1;
> + }
> +
> + tap_write_mbufs(txq, num_mbufs, mbuf, hdrs_len,
> + &num_packets, &num_tx_bytes);
> num_tx++;
> - num_tx_bytes += mbuf->pkt_len;
> - rte_pktmbuf_free(mbuf);
> + rte_pktmbuf_free(mbuf_in);
> }
>
> - txq->stats.opackets += num_tx;
> + txq->stats.opackets += num_packets;
> txq->stats.errs += nb_pkts - num_tx;
> txq->stats.obytes += num_tx_bytes;
>
> @@ -1066,31 +1124,73 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
> }
>
> static int
> +tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
> +{
> + uint32_t gso_types;
> + char pool_name[64];
> +
> + /*
> + * Create private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes
> + * size per mbuf use this pool for both direct and indirect mbufs
> + */
> +
> + struct rte_mempool *mp; /* Mempool for GSO packets */
> + /* initialize GSO context */
> + gso_types = DEV_TX_OFFLOAD_TCP_TSO;
> + snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
> + mp = rte_mempool_lookup((const char *)pool_name);
> + if (!mp) {
> + mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
> + 0, 0, RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
> + SOCKET_ID_ANY);
You have setup the mempool with no cache size, which means you have to take a lock for each allocate. This could changed to have a small cache per lcore say 8, but the total number of mbufs needs to be large enough to not allow starvation for a lcore. total_mbufs = (max_num_ports * cache_size) + some_extra mbufs;
> + if (!mp) {
> + struct pmd_internals *pmd = dev->data->dev_private;
> + RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device %s\n",
> + pmd->name, dev->device->name);
> + return -1;
> + }
> + }
> +
> + gso_ctx->direct_pool = mp;
> + gso_ctx->indirect_pool = mp;
> + gso_ctx->gso_types = gso_types;
> + gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
> + gso_ctx->flag = 0;
> +
> + return 0;
> +}
> +
> +static int
> tap_setup_queue(struct rte_eth_dev *dev,
> struct pmd_internals *internals,
> uint16_t qid,
> int is_rx)
> {
> + int ret;
> int *fd;
> int *other_fd;
> const char *dir;
> struct pmd_internals *pmd = dev->data->dev_private;
> struct rx_queue *rx = &internals->rxq[qid];
> struct tx_queue *tx = &internals->txq[qid];
> + struct rte_gso_ctx *gso_ctx;
>
> if (is_rx) {
> fd = &rx->fd;
> other_fd = &tx->fd;
> dir = "rx";
> + gso_ctx = NULL;
> } else {
> fd = &tx->fd;
> other_fd = &rx->fd;
> dir = "tx";
> + gso_ctx = &tx->gso_ctx;
> }
> if (*fd != -1) {
> /* fd for this queue already exists */
> TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
> pmd->name, *fd, dir, qid);
> + gso_ctx = NULL;
> } else if (*other_fd != -1) {
> /* Only other_fd exists. dup it */
> *fd = dup(*other_fd);
> @@ -1115,6 +1215,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
>
> tx->mtu = &dev->data->mtu;
> rx->rxmode = &dev->data->dev_conf.rxmode;
> + if (gso_ctx) {
> + ret = tap_gso_ctx_setup(gso_ctx, dev);
> + if (ret)
> + return -1;
> + }
>
> tx->type = pmd->type;
>
> diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
> index 7b21d0d..44e2773 100644
> --- a/drivers/net/tap/rte_eth_tap.h
> +++ b/drivers/net/tap/rte_eth_tap.h
> @@ -15,6 +15,7 @@
>
> #include <rte_ethdev_driver.h>
> #include <rte_ether.h>
> +#include <rte_gso.h>
> #include "tap_log.h"
>
> #ifdef IFF_MULTI_QUEUE
> @@ -22,6 +23,7 @@
> #else
> #define RTE_PMD_TAP_MAX_QUEUES 1
> #endif
> +#define MAX_GSO_MBUFS 64
>
> enum rte_tuntap_type {
> ETH_TUNTAP_TYPE_UNKNOWN,
> @@ -59,6 +61,7 @@ struct tx_queue {
> uint16_t *mtu; /* Pointer to MTU from dev_data */
> uint16_t csum:1; /* Enable checksum offloading */
> struct pkt_stats stats; /* Stats for this TX queue */
> + struct rte_gso_ctx gso_ctx; /* GSO context */
> };
>
> struct pmd_internals {
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 1e32c83..e2ee879 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -38,8 +38,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT) += -lrte_port
> _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump
> _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor
> _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag
> -_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
> -_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
> _LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter
> _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm
> # librte_acl needs --whole-archive because of weak functions
> @@ -61,6 +59,8 @@ endif
> _LDLIBS-y += --whole-archive
>
> _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE) += -lrte_cfgfile
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
> _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH) += -lrte_hash
> _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER) += -lrte_member
> _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST) += -lrte_vhost
> --
> 2.7.4
>
Regards,
Keith
> -----Original Message-----
> From: Wiles, Keith [mailto:keith.wiles@intel.com]
> Sent: Wednesday, June 13, 2018 7:04 PM
> To: Ophir Munk <ophirmu@mellanox.com>
> Cc: DPDK <dev@dpdk.org>; Pascal Mazon <pascal.mazon@6wind.com>;
> Thomas Monjalon <thomas@monjalon.net>; Olga Shern
> <olgas@mellanox.com>
> Subject: Re: [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload)
>
>
>
> > On Jun 12, 2018, at 11:31 AM, Ophir Munk <ophirmu@mellanox.com>
> wrote:
> >
> > This commit implements TCP segmentation offload in TAP.
> > librte_gso library is used to segment large TCP payloads (e.g. packets
> > of 64K bytes size) into smaller MTU size buffers.
> > By supporting TSO offload capability in software a TAP device can be
> > used as a failsafe sub device and be paired with another PCI device
> > which supports TSO capability in HW.
> >
> > For more details on librte_gso implementation please refer to dpdk
> > documentation.
> > The number of newly generated TCP TSO segments is limited to 64.
> >
> > Reviewed-by: Raslan Darawsheh <rasland@mellanox.com>
> > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> > ---
> > drivers/net/tap/Makefile | 2 +-
> > drivers/net/tap/rte_eth_tap.c | 159
> +++++++++++++++++++++++++++++++++++-------
> > drivers/net/tap/rte_eth_tap.h | 3 +
> > mk/rte.app.mk | 4 +-
> > 4 files changed, 138 insertions(+), 30 deletions(-)
> >
> > diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile index
> > ccc5c5f..3243365 100644
> > --- a/drivers/net/tap/Makefile
> > +++ b/drivers/net/tap/Makefile
> > @@ -24,7 +24,7 @@ CFLAGS += -I.
> > CFLAGS += $(WERROR_FLAGS)
> > LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring LDLIBS +=
> > -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash -LDLIBS +=
> > -lrte_bus_vdev
> > +LDLIBS += -lrte_bus_vdev -lrte_gso
> >
> > CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
> >
> > diff --git a/drivers/net/tap/rte_eth_tap.c
> > b/drivers/net/tap/rte_eth_tap.c index c19f053..62b931f 100644
> > --- a/drivers/net/tap/rte_eth_tap.c
> > +++ b/drivers/net/tap/rte_eth_tap.c
> > @@ -17,6 +17,7 @@
> > #include <rte_ip.h>
> > #include <rte_string_fns.h>
> >
> > +#include <assert.h>
> > #include <sys/types.h>
> > #include <sys/stat.h>
> > #include <sys/socket.h>
> > @@ -55,6 +56,9 @@
> > #define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
> > #define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|"
> ETH_TAP_USR_MAC_FMT
> >
> > +#define TAP_GSO_MBUFS_NUM 128
> > +#define TAP_GSO_MBUF_SEG_SIZE 128
> > +
> > static struct rte_vdev_driver pmd_tap_drv; static struct
> > rte_vdev_driver pmd_tun_drv;
> >
> > @@ -412,7 +416,8 @@ tap_tx_offload_get_queue_capa(void)
> > return DEV_TX_OFFLOAD_MULTI_SEGS |
> > DEV_TX_OFFLOAD_IPV4_CKSUM |
> > DEV_TX_OFFLOAD_UDP_CKSUM |
> > - DEV_TX_OFFLOAD_TCP_CKSUM;
> > + DEV_TX_OFFLOAD_TCP_CKSUM |
> > + DEV_TX_OFFLOAD_TCP_TSO;
> > }
> >
> > /* Finalize l4 checksum calculation */ @@ -479,23 +484,15 @@
> > tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
> > }
> > }
> >
> > -/* Callback to handle sending packets from the tap interface
> > - */
> > -static uint16_t
> > -pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> > +static inline void
> > +tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
> > + struct rte_mbuf **pmbufs, uint16_t l234_hlen,
> > + uint16_t *num_packets, unsigned long
> *num_tx_bytes)
> > {
> > - struct tx_queue *txq = queue;
> > - uint16_t num_tx = 0;
> > - unsigned long num_tx_bytes = 0;
> > - uint32_t max_size;
> > int i;
> >
> > - if (unlikely(nb_pkts == 0))
> > - return 0;
> > -
> > - max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> > - for (i = 0; i < nb_pkts; i++) {
> > - struct rte_mbuf *mbuf = bufs[num_tx];
> > + for (i = 0; i < num_mbufs; i++) {
> > + struct rte_mbuf *mbuf = pmbufs[i];
> > struct iovec iovecs[mbuf->nb_segs + 2];
> > struct tun_pi pi = { .flags = 0, .proto = 0x00 };
> > struct rte_mbuf *seg = mbuf;
> > @@ -503,8 +500,7 @@ pmd_tx_burst(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
> > int proto;
> > int n;
> > int j;
> > - int k; /* first index in iovecs for copying segments */
> > - uint16_t l234_hlen; /* length of layers 2,3,4 headers */
> > + int k; /* current index in iovecs for copying segments */
> > uint16_t seg_len; /* length of first segment */
> > uint16_t nb_segs;
> > uint16_t *l4_cksum; /* l4 checksum (pseudo header +
> payload) */ @@
> > -512,10 +508,6 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs,
> uint16_t nb_pkts)
> > uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header
> checksum */
> > uint16_t is_cksum = 0; /* in case cksum should be offloaded
> */
> >
> > - /* stats.errs will be incremented */
> > - if (rte_pktmbuf_pkt_len(mbuf) > max_size)
> > - break;
> > -
> > l4_cksum = NULL;
> > if (txq->type == ETH_TUNTAP_TYPE_TUN) {
> > /*
> > @@ -554,9 +546,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
> > l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf-
> >l4_len;
> > if (seg_len < l234_hlen)
> > break;
> > -
> > - /* To change checksums, work on a
> > - * copy of l2, l3 l4 headers.
>
> Adding back in the blank line above would be nice for readability.
> > + /* To change checksums, work on a * copy of l2, l3
> > + * headers + l4 pseudo header
> > */
> > rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void
> *),
> > l234_hlen);
> > @@ -598,13 +589,80 @@ pmd_tx_burst(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
> > n = writev(txq->fd, iovecs, j);
> > if (n <= 0)
> > break;
> > + (*num_packets)++;
> > + (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
> > + }
> > +}
> > +
> > +/* Callback to handle sending packets from the tap interface */
> > +static uint16_t pmd_tx_burst(void *queue, struct rte_mbuf **bufs,
> > +uint16_t nb_pkts) {
> > + struct tx_queue *txq = queue;
> > + uint16_t num_tx = 0;
> > + uint16_t num_packets = 0;
> > + unsigned long num_tx_bytes = 0;
> > + uint16_t tso_segsz = 0;
> > + uint16_t hdrs_len;
> > + uint32_t max_size;
> > + int i;
> > + uint64_t tso;
> > + int ret;
> >
> > + if (unlikely(nb_pkts == 0))
> > + return 0;
> > +
> > + struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
> > + max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> > + for (i = 0; i < nb_pkts; i++) {
> > + struct rte_mbuf *mbuf_in = bufs[num_tx];
> > + struct rte_mbuf **mbuf;
> > + uint16_t num_mbufs;
> > +
> > + tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
> > + if (tso) {
> > + struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
>
> Missing blank line here, this one is not optional.
> > + assert(gso_ctx != NULL);
>
> Blank line would be nice.
> > + /* TCP segmentation implies TCP checksum offload
> */
> > + mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;
>
> Blank line would be nice.
> > + /* gso size is calculated without ETHER_CRC_LEN */
> > + hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
> > + mbuf_in->l4_len;
> > + tso_segsz = mbuf_in->tso_segsz + hdrs_len;
> > + if (unlikely(tso_segsz == hdrs_len) ||
> > + tso_segsz > *txq->mtu) {
> > + txq->stats.errs++;
> > + break;
> > + }
> > + gso_ctx->gso_size = tso_segsz;
> > + ret = rte_gso_segment(mbuf_in, /* packet to
> segment */
> > + gso_ctx, /* gso control block */
> > + (struct rte_mbuf **)&gso_mbufs, /* out
> mbufs */
> > + RTE_DIM(gso_mbufs)); /* max tso mbufs */
> > +
> > + /* ret contains the number of new created mbufs */
> > + if (ret < 0)
> > + break;
> > +
> > + mbuf = gso_mbufs;
> > + num_mbufs = ret;
> > + } else {
> > + /* stats.errs will be incremented */
> > + if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
> > + break;
> > +
> > + mbuf = &mbuf_in;
> > + num_mbufs = 1;
> > + }
> > +
> > + tap_write_mbufs(txq, num_mbufs, mbuf, hdrs_len,
> > + &num_packets, &num_tx_bytes);
> > num_tx++;
> > - num_tx_bytes += mbuf->pkt_len;
> > - rte_pktmbuf_free(mbuf);
> > + rte_pktmbuf_free(mbuf_in);
> > }
> >
> > - txq->stats.opackets += num_tx;
> > + txq->stats.opackets += num_packets;
> > txq->stats.errs += nb_pkts - num_tx;
> > txq->stats.obytes += num_tx_bytes;
> >
> > @@ -1066,31 +1124,73 @@ tap_mac_set(struct rte_eth_dev *dev, struct
> > ether_addr *mac_addr) }
> >
> > static int
> > +tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev
> > +*dev) {
> > + uint32_t gso_types;
> > + char pool_name[64];
> > +
> > + /*
> > + * Create private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes
> > + * size per mbuf use this pool for both direct and indirect mbufs
> > + */
> > +
> > + struct rte_mempool *mp; /* Mempool for GSO packets */
> > + /* initialize GSO context */
> > + gso_types = DEV_TX_OFFLOAD_TCP_TSO;
> > + snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device-
> >name);
> > + mp = rte_mempool_lookup((const char *)pool_name);
> > + if (!mp) {
> > + mp = rte_pktmbuf_pool_create(pool_name,
> TAP_GSO_MBUFS_NUM,
> > + 0, 0, RTE_PKTMBUF_HEADROOM +
> TAP_GSO_MBUF_SEG_SIZE,
> > + SOCKET_ID_ANY);
>
> You have setup the mempool with no cache size, which means you have to
> take a lock for each allocate. This could changed to have a small cache per
> lcore say 8, but the total number of mbufs needs to be large enough to not
> allow starvation for a lcore. total_mbufs = (max_num_ports * cache_size) +
> some_extra mbufs;
>
I will set cache_size as 4.
The total_mbufs should be mbufs_per_core(128) * cache_size(4) where the max_num_ports
is already taken into consideration in mbufs_per_core. For example, for a TCP packet of 1024 bytes and TSO max seg size of 256 bytes GSO will allocate 5 mbufs (one direct and four indirect) regardless of the number of ports.
> > + if (!mp) {
> > + struct pmd_internals *pmd = dev->data-
> >dev_private;
> > + RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf
> pool for device %s\n",
> > + pmd->name, dev->device->name);
> > + return -1;
> > + }
> > + }
> > +
> > + gso_ctx->direct_pool = mp;
> > + gso_ctx->indirect_pool = mp;
> > + gso_ctx->gso_types = gso_types;
> > + gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
> > + gso_ctx->flag = 0;
> > +
> > + return 0;
> > +}
> > +
> > +static int
> > tap_setup_queue(struct rte_eth_dev *dev,
> > struct pmd_internals *internals,
> > uint16_t qid,
> > int is_rx)
> > {
> > + int ret;
> > int *fd;
> > int *other_fd;
> > const char *dir;
> > struct pmd_internals *pmd = dev->data->dev_private;
> > struct rx_queue *rx = &internals->rxq[qid];
> > struct tx_queue *tx = &internals->txq[qid];
> > + struct rte_gso_ctx *gso_ctx;
> >
> > if (is_rx) {
> > fd = &rx->fd;
> > other_fd = &tx->fd;
> > dir = "rx";
> > + gso_ctx = NULL;
> > } else {
> > fd = &tx->fd;
> > other_fd = &rx->fd;
> > dir = "tx";
> > + gso_ctx = &tx->gso_ctx;
> > }
> > if (*fd != -1) {
> > /* fd for this queue already exists */
> > TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
> > pmd->name, *fd, dir, qid);
> > + gso_ctx = NULL;
> > } else if (*other_fd != -1) {
> > /* Only other_fd exists. dup it */
> > *fd = dup(*other_fd);
> > @@ -1115,6 +1215,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
> >
> > tx->mtu = &dev->data->mtu;
> > rx->rxmode = &dev->data->dev_conf.rxmode;
> > + if (gso_ctx) {
> > + ret = tap_gso_ctx_setup(gso_ctx, dev);
> > + if (ret)
> > + return -1;
> > + }
> >
> > tx->type = pmd->type;
> >
> > diff --git a/drivers/net/tap/rte_eth_tap.h
> > b/drivers/net/tap/rte_eth_tap.h index 7b21d0d..44e2773 100644
> > --- a/drivers/net/tap/rte_eth_tap.h
> > +++ b/drivers/net/tap/rte_eth_tap.h
> > @@ -15,6 +15,7 @@
> >
> > #include <rte_ethdev_driver.h>
> > #include <rte_ether.h>
> > +#include <rte_gso.h>
> > #include "tap_log.h"
> >
> > #ifdef IFF_MULTI_QUEUE
> > @@ -22,6 +23,7 @@
> > #else
> > #define RTE_PMD_TAP_MAX_QUEUES 1
> > #endif
> > +#define MAX_GSO_MBUFS 64
> >
> > enum rte_tuntap_type {
> > ETH_TUNTAP_TYPE_UNKNOWN,
> > @@ -59,6 +61,7 @@ struct tx_queue {
> > uint16_t *mtu; /* Pointer to MTU from dev_data */
> > uint16_t csum:1; /* Enable checksum offloading */
> > struct pkt_stats stats; /* Stats for this TX queue */
> > + struct rte_gso_ctx gso_ctx; /* GSO context */
> > };
> >
> > struct pmd_internals {
> > diff --git a/mk/rte.app.mk b/mk/rte.app.mk index 1e32c83..e2ee879
> > 100644
> > --- a/mk/rte.app.mk
> > +++ b/mk/rte.app.mk
> > @@ -38,8 +38,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT) += -
> lrte_port
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag
> > -_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
> > -_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm
> > # librte_acl needs --whole-archive because of weak functions @@ -61,6
> > +59,8 @@ endif _LDLIBS-y += --whole-archive
> >
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE) += -lrte_cfgfile
> > +_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
> > +_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH) += -lrte_hash
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER) += -lrte_member
> > _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST) += -lrte_vhost
> > --
> > 2.7.4
> >
>
> Regards,
> Keith
> On Jun 14, 2018, at 2:59 AM, Ophir Munk <ophirmu@mellanox.com> wrote:
>
>
>
>> -----Original Message-----
>> From: Wiles, Keith [mailto:keith.wiles@intel.com]
>> Sent: Wednesday, June 13, 2018 7:04 PM
>> To: Ophir Munk <ophirmu@mellanox.com>
>> Cc: DPDK <dev@dpdk.org>; Pascal Mazon <pascal.mazon@6wind.com>;
>> Thomas Monjalon <thomas@monjalon.net>; Olga Shern
>> <olgas@mellanox.com>
>> Subject: Re: [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload)
>>
>>
>>
>>> On Jun 12, 2018, at 11:31 AM, Ophir Munk <ophirmu@mellanox.com>
>> wrote:
>>>
>>> This commit implements TCP segmentation offload in TAP.
>>> librte_gso library is used to segment large TCP payloads (e.g. packets
>>> of 64K bytes size) into smaller MTU size buffers.
>>> By supporting TSO offload capability in software a TAP device can be
>>> used as a failsafe sub device and be paired with another PCI device
>>> which supports TSO capability in HW.
>>>
>>> For more details on librte_gso implementation please refer to dpdk
>>> documentation.
>>> The number of newly generated TCP TSO segments is limited to 64.
>>>
>>> Reviewed-by: Raslan Darawsheh <rasland@mellanox.com>
>>> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
>>> ---
>>> drivers/net/tap/Makefile | 2 +-
>>> drivers/net/tap/rte_eth_tap.c | 159
>> +++++++++++++++++++++++++++++++++++-------
>>> drivers/net/tap/rte_eth_tap.h | 3 +
>>> mk/rte.app.mk | 4 +-
>>> 4 files changed, 138 insertions(+), 30 deletions(-)
>>
>> You have setup the mempool with no cache size, which means you have to
>> take a lock for each allocate. This could changed to have a small cache per
>> lcore say 8, but the total number of mbufs needs to be large enough to not
>> allow starvation for a lcore. total_mbufs = (max_num_ports * cache_size) +
>> some_extra mbufs;
>>
>
> I will set cache_size as 4.
> The total_mbufs should be mbufs_per_core(128) * cache_size(4) where the max_num_ports
> is already taken into consideration in mbufs_per_core. For example, for a TCP packet of 1024 bytes and TSO max seg size of 256 bytes GSO will allocate 5 mbufs (one direct and four indirect) regardless of the number of ports.
Sounds good, thanks.
Regards,
Keith
@@ -24,7 +24,7 @@ CFLAGS += -I.
CFLAGS += $(WERROR_FLAGS)
LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
-LDLIBS += -lrte_bus_vdev
+LDLIBS += -lrte_bus_vdev -lrte_gso
CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
@@ -17,6 +17,7 @@
#include <rte_ip.h>
#include <rte_string_fns.h>
+#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
@@ -55,6 +56,9 @@
#define ETH_TAP_CMP_MAC_FMT "0123456789ABCDEFabcdef"
#define ETH_TAP_MAC_ARG_FMT ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
+#define TAP_GSO_MBUFS_NUM 128
+#define TAP_GSO_MBUF_SEG_SIZE 128
+
static struct rte_vdev_driver pmd_tap_drv;
static struct rte_vdev_driver pmd_tun_drv;
@@ -412,7 +416,8 @@ tap_tx_offload_get_queue_capa(void)
return DEV_TX_OFFLOAD_MULTI_SEGS |
DEV_TX_OFFLOAD_IPV4_CKSUM |
DEV_TX_OFFLOAD_UDP_CKSUM |
- DEV_TX_OFFLOAD_TCP_CKSUM;
+ DEV_TX_OFFLOAD_TCP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_TSO;
}
/* Finalize l4 checksum calculation */
@@ -479,23 +484,15 @@ tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
}
}
-/* Callback to handle sending packets from the tap interface
- */
-static uint16_t
-pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+static inline void
+tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
+ struct rte_mbuf **pmbufs, uint16_t l234_hlen,
+ uint16_t *num_packets, unsigned long *num_tx_bytes)
{
- struct tx_queue *txq = queue;
- uint16_t num_tx = 0;
- unsigned long num_tx_bytes = 0;
- uint32_t max_size;
int i;
- if (unlikely(nb_pkts == 0))
- return 0;
-
- max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
- for (i = 0; i < nb_pkts; i++) {
- struct rte_mbuf *mbuf = bufs[num_tx];
+ for (i = 0; i < num_mbufs; i++) {
+ struct rte_mbuf *mbuf = pmbufs[i];
struct iovec iovecs[mbuf->nb_segs + 2];
struct tun_pi pi = { .flags = 0, .proto = 0x00 };
struct rte_mbuf *seg = mbuf;
@@ -503,8 +500,7 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
int proto;
int n;
int j;
- int k; /* first index in iovecs for copying segments */
- uint16_t l234_hlen; /* length of layers 2,3,4 headers */
+ int k; /* current index in iovecs for copying segments */
uint16_t seg_len; /* length of first segment */
uint16_t nb_segs;
uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
@@ -512,10 +508,6 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
uint16_t is_cksum = 0; /* in case cksum should be offloaded */
- /* stats.errs will be incremented */
- if (rte_pktmbuf_pkt_len(mbuf) > max_size)
- break;
-
l4_cksum = NULL;
if (txq->type == ETH_TUNTAP_TYPE_TUN) {
/*
@@ -554,9 +546,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
if (seg_len < l234_hlen)
break;
-
- /* To change checksums, work on a
- * copy of l2, l3 l4 headers.
+ /* To change checksums, work on a * copy of l2, l3
+ * headers + l4 pseudo header
*/
rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
l234_hlen);
@@ -598,13 +589,80 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
n = writev(txq->fd, iovecs, j);
if (n <= 0)
break;
+ (*num_packets)++;
+ (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
+ }
+}
+
+/* Callback to handle sending packets from the tap interface
+ */
+static uint16_t
+pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+ struct tx_queue *txq = queue;
+ uint16_t num_tx = 0;
+ uint16_t num_packets = 0;
+ unsigned long num_tx_bytes = 0;
+ uint16_t tso_segsz = 0;
+ uint16_t hdrs_len;
+ uint32_t max_size;
+ int i;
+ uint64_t tso;
+ int ret;
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+ max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
+ for (i = 0; i < nb_pkts; i++) {
+ struct rte_mbuf *mbuf_in = bufs[num_tx];
+ struct rte_mbuf **mbuf;
+ uint16_t num_mbufs;
+
+ tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
+ if (tso) {
+ struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
+ assert(gso_ctx != NULL);
+ /* TCP segmentation implies TCP checksum offload */
+ mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;
+ /* gso size is calculated without ETHER_CRC_LEN */
+ hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
+ mbuf_in->l4_len;
+ tso_segsz = mbuf_in->tso_segsz + hdrs_len;
+ if (unlikely(tso_segsz == hdrs_len) ||
+ tso_segsz > *txq->mtu) {
+ txq->stats.errs++;
+ break;
+ }
+ gso_ctx->gso_size = tso_segsz;
+ ret = rte_gso_segment(mbuf_in, /* packet to segment */
+ gso_ctx, /* gso control block */
+ (struct rte_mbuf **)&gso_mbufs, /* out mbufs */
+ RTE_DIM(gso_mbufs)); /* max tso mbufs */
+
+ /* ret contains the number of new created mbufs */
+ if (ret < 0)
+ break;
+
+ mbuf = gso_mbufs;
+ num_mbufs = ret;
+ } else {
+ /* stats.errs will be incremented */
+ if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
+ break;
+
+ mbuf = &mbuf_in;
+ num_mbufs = 1;
+ }
+
+ tap_write_mbufs(txq, num_mbufs, mbuf, hdrs_len,
+ &num_packets, &num_tx_bytes);
num_tx++;
- num_tx_bytes += mbuf->pkt_len;
- rte_pktmbuf_free(mbuf);
+ rte_pktmbuf_free(mbuf_in);
}
- txq->stats.opackets += num_tx;
+ txq->stats.opackets += num_packets;
txq->stats.errs += nb_pkts - num_tx;
txq->stats.obytes += num_tx_bytes;
@@ -1066,31 +1124,73 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
}
static int
+tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
+{
+ uint32_t gso_types;
+ char pool_name[64];
+
+ /*
+ * Create private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes
+ * size per mbuf use this pool for both direct and indirect mbufs
+ */
+
+ struct rte_mempool *mp; /* Mempool for GSO packets */
+ /* initialize GSO context */
+ gso_types = DEV_TX_OFFLOAD_TCP_TSO;
+ snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
+ mp = rte_mempool_lookup((const char *)pool_name);
+ if (!mp) {
+ mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
+ 0, 0, RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
+ SOCKET_ID_ANY);
+ if (!mp) {
+ struct pmd_internals *pmd = dev->data->dev_private;
+ RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device %s\n",
+ pmd->name, dev->device->name);
+ return -1;
+ }
+ }
+
+ gso_ctx->direct_pool = mp;
+ gso_ctx->indirect_pool = mp;
+ gso_ctx->gso_types = gso_types;
+ gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
+ gso_ctx->flag = 0;
+
+ return 0;
+}
+
+static int
tap_setup_queue(struct rte_eth_dev *dev,
struct pmd_internals *internals,
uint16_t qid,
int is_rx)
{
+ int ret;
int *fd;
int *other_fd;
const char *dir;
struct pmd_internals *pmd = dev->data->dev_private;
struct rx_queue *rx = &internals->rxq[qid];
struct tx_queue *tx = &internals->txq[qid];
+ struct rte_gso_ctx *gso_ctx;
if (is_rx) {
fd = &rx->fd;
other_fd = &tx->fd;
dir = "rx";
+ gso_ctx = NULL;
} else {
fd = &tx->fd;
other_fd = &rx->fd;
dir = "tx";
+ gso_ctx = &tx->gso_ctx;
}
if (*fd != -1) {
/* fd for this queue already exists */
TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
pmd->name, *fd, dir, qid);
+ gso_ctx = NULL;
} else if (*other_fd != -1) {
/* Only other_fd exists. dup it */
*fd = dup(*other_fd);
@@ -1115,6 +1215,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
tx->mtu = &dev->data->mtu;
rx->rxmode = &dev->data->dev_conf.rxmode;
+ if (gso_ctx) {
+ ret = tap_gso_ctx_setup(gso_ctx, dev);
+ if (ret)
+ return -1;
+ }
tx->type = pmd->type;
@@ -15,6 +15,7 @@
#include <rte_ethdev_driver.h>
#include <rte_ether.h>
+#include <rte_gso.h>
#include "tap_log.h"
#ifdef IFF_MULTI_QUEUE
@@ -22,6 +23,7 @@
#else
#define RTE_PMD_TAP_MAX_QUEUES 1
#endif
+#define MAX_GSO_MBUFS 64
enum rte_tuntap_type {
ETH_TUNTAP_TYPE_UNKNOWN,
@@ -59,6 +61,7 @@ struct tx_queue {
uint16_t *mtu; /* Pointer to MTU from dev_data */
uint16_t csum:1; /* Enable checksum offloading */
struct pkt_stats stats; /* Stats for this TX queue */
+ struct rte_gso_ctx gso_ctx; /* GSO context */
};
struct pmd_internals {
@@ -38,8 +38,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT) += -lrte_port
_LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump
_LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor
_LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter
_LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm
# librte_acl needs --whole-archive because of weak functions
@@ -61,6 +59,8 @@ endif
_LDLIBS-y += --whole-archive
_LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE) += -lrte_cfgfile
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso
_LDLIBS-$(CONFIG_RTE_LIBRTE_HASH) += -lrte_hash
_LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER) += -lrte_member
_LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST) += -lrte_vhost