[v4] net/bonding: per-slave intermediate rx ring

Message ID 20180816133208.26566-1-bluca@debian.org (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers
Series [v4] net/bonding: per-slave intermediate rx ring |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Luca Boccassi Aug. 16, 2018, 1:32 p.m. UTC
  During bond 802.3ad receive, a burst of packets is fetched from
each slave into a local array and appended to per-slave ring buffer.
Packets are taken from the head of the ring buffer and returned to
the caller.  The number of mbufs provided to each slave is sufficient
to meet the requirements of the ixgbe vector receive.

This small change improves performances of the bonding driver
considerably. Vyatta has been using it for years in production.

Cc: stable@dpdk.org

Signed-off-by: Eric Kinzie <ekinzie@brocade.com>
Signed-off-by: Luca Boccassi <bluca@debian.org>
---
v2 and v3: fix checkpatch warnings
v4: add Eric's original signed-off-by from the Vyatta internal repo

 drivers/net/bonding/rte_eth_bond_api.c     | 13 +++
 drivers/net/bonding/rte_eth_bond_pmd.c     | 98 +++++++++++++++++-----
 drivers/net/bonding/rte_eth_bond_private.h |  4 +
 3 files changed, 95 insertions(+), 20 deletions(-)
  

Comments

Chas Williams Aug. 20, 2018, 2:11 p.m. UTC | #1
This will need to be implemented for some of the other RX
burst methods at some point for other modes to see this
performance improvement (with the exception of active-backup).

On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <bluca@debian.org> wrote:

> During bond 802.3ad receive, a burst of packets is fetched from
> each slave into a local array and appended to per-slave ring buffer.
> Packets are taken from the head of the ring buffer and returned to
> the caller.  The number of mbufs provided to each slave is sufficient
> to meet the requirements of the ixgbe vector receive.
>
> This small change improves performances of the bonding driver
> considerably. Vyatta has been using it for years in production.
>
> Cc: stable@dpdk.org
>
> Signed-off-by: Eric Kinzie <ekinzie@brocade.com>
> Signed-off-by: Luca Boccassi <bluca@debian.org>
>

Acked-by: Chas Williams <chas3@att.com>



> ---
> v2 and v3: fix checkpatch warnings
> v4: add Eric's original signed-off-by from the Vyatta internal repo
>
>  drivers/net/bonding/rte_eth_bond_api.c     | 13 +++
>  drivers/net/bonding/rte_eth_bond_pmd.c     | 98 +++++++++++++++++-----
>  drivers/net/bonding/rte_eth_bond_private.h |  4 +
>  3 files changed, 95 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/bonding/rte_eth_bond_api.c
> b/drivers/net/bonding/rte_eth_bond_api.c
> index 8bc04cfd11..3d22203e91 100644
> --- a/drivers/net/bonding/rte_eth_bond_api.c
> +++ b/drivers/net/bonding/rte_eth_bond_api.c
> @@ -524,6 +524,10 @@ __eth_bond_slave_remove_lock_free(uint16_t
> bonded_port_id,
>
> sizeof(*(rte_eth_devices[bonded_port_id].data->mac_addrs)));
>         }
>         if (internals->slave_count == 0) {
> +               /* Remove any remaining packets in the receive ring */
> +               struct rte_mbuf *bufs[PMD_BOND_RECV_PKTS_PER_SLAVE];
> +               unsigned int j, count;
> +
>                 internals->rx_offload_capa = 0;
>                 internals->tx_offload_capa = 0;
>                 internals->rx_queue_offload_capa = 0;
> @@ -532,6 +536,15 @@ __eth_bond_slave_remove_lock_free(uint16_t
> bonded_port_id,
>                 internals->reta_size = 0;
>                 internals->candidate_max_rx_pktlen = 0;
>                 internals->max_rx_pktlen = 0;
> +
> +               do {
> +                       count = rte_ring_dequeue_burst(internals->rx_ring,
> +                                       (void **)bufs,
> +                                       PMD_BOND_RECV_PKTS_PER_SLAVE,
> +                                       NULL);
> +                       for  (j = 0; j < count; j++)
> +                               rte_pktmbuf_free(bufs[j]);
> +               } while (count > 0);
>         }
>         return 0;
>  }
> diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c
> b/drivers/net/bonding/rte_eth_bond_pmd.c
> index 58f7377c60..ae756c4e3a 100644
> --- a/drivers/net/bonding/rte_eth_bond_pmd.c
> +++ b/drivers/net/bonding/rte_eth_bond_pmd.c
> @@ -18,6 +18,8 @@
>  #include <rte_alarm.h>
>  #include <rte_cycles.h>
>  #include <rte_string_fns.h>
> +#include <rte_errno.h>
> +#include <rte_lcore.h>
>
>  #include "rte_eth_bond.h"
>  #include "rte_eth_bond_private.h"
> @@ -402,10 +404,15 @@ bond_ethdev_rx_burst_8023ad(void *queue, struct
> rte_mbuf **bufs,
>         struct bond_dev_private *internals = bd_rx_q->dev_private;
>         struct ether_addr bond_mac;
>
> +       unsigned int rx_ring_avail =
> rte_ring_free_count(internals->rx_ring);
> +       struct rte_mbuf *mbuf_bounce[PMD_BOND_RECV_PKTS_PER_SLAVE];
> +
>         struct ether_hdr *hdr;
>
>         const uint16_t ether_type_slow_be =
> rte_be_to_cpu_16(ETHER_TYPE_SLOW);
>         uint16_t num_rx_total = 0;      /* Total number of received
> packets */
> +       uint16_t num_rx_slave;
> +       uint16_t num_enq_slave;
>         uint16_t slaves[RTE_MAX_ETHPORTS];
>         uint16_t slave_count, idx;
>
> @@ -414,6 +421,9 @@ bond_ethdev_rx_burst_8023ad(void *queue, struct
> rte_mbuf **bufs,
>         uint8_t i, j, k;
>         uint8_t subtype;
>
> +       if (rx_ring_avail < PMD_BOND_RECV_PKTS_PER_SLAVE)
> +               goto dequeue;
> +
>         rte_eth_macaddr_get(internals->port_id, &bond_mac);
>         /* Copy slave list to protect against slave up/down changes during
> tx
>          * bursting */
> @@ -426,62 +436,96 @@ bond_ethdev_rx_burst_8023ad(void *queue, struct
> rte_mbuf **bufs,
>                 internals->active_slave = 0;
>                 idx = 0;
>         }
> -       for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
> -               j = num_rx_total;
> +       for (i = 0; i < slave_count && num_rx_total < rx_ring_avail; i++) {
> +               j = 0;
>                 collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
>                                          COLLECTING);
>
>                 /* Read packets from this slave */
> -               num_rx_total += rte_eth_rx_burst(slaves[idx],
> bd_rx_q->queue_id,
> -                               &bufs[num_rx_total], nb_pkts -
> num_rx_total);
> +               if (unlikely(rx_ring_avail - num_rx_total <
> +                               PMD_BOND_RECV_PKTS_PER_SLAVE))
> +                       continue;
> +               num_rx_slave = rte_eth_rx_burst(slaves[idx],
> bd_rx_q->queue_id,
> +                               mbuf_bounce, PMD_BOND_RECV_PKTS_PER_SLAVE);
>
> -               for (k = j; k < 2 && k < num_rx_total; k++)
> -                       rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));
> +               for (k = j; k < 2 && k < num_rx_slave; k++)
> +                       rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[k],
> void *));
>
>                 /* Handle slow protocol packets. */
> -               while (j < num_rx_total) {
> +               while (j < num_rx_slave) {
>
>                         /* If packet is not pure L2 and is known, skip it
> */
> -                       if ((bufs[j]->packet_type & ~RTE_PTYPE_L2_ETHER)
> != 0) {
> +                       if ((mbuf_bounce[j]->packet_type &
> ~RTE_PTYPE_L2_ETHER) != 0) {
>                                 j++;
>                                 continue;
>                         }
>
> -                       if (j + 3 < num_rx_total)
> -                               rte_prefetch0(rte_pktmbuf_mtod(bufs[j +
> 3], void *));
> +                       if (j + 3 < num_rx_slave)
> +
>  rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[j + 3],
> +                                                              void *));
>
> -                       hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr
> *);
> +                       hdr = rte_pktmbuf_mtod(mbuf_bounce[j],
> +                                              struct ether_hdr *);
>                         subtype = ((struct slow_protocol_frame
> *)hdr)->slow_protocol.subtype;
>
>                         /* Remove packet from array if it is slow packet
> or slave is not
>                          * in collecting state or bonding interface is not
> in promiscuous
>                          * mode and packet address does not match. */
> -                       if (unlikely(is_lacp_packets(hdr->ether_type,
> subtype, bufs[j]) ||
> +                       if (unlikely(is_lacp_packets(hdr->ether_type,
> +                                                    subtype,
> mbuf_bounce[j]) ||
>                                 !collecting || (!promisc &&
>
> !is_multicast_ether_addr(&hdr->d_addr) &&
>                                         !is_same_ether_addr(&bond_mac,
> &hdr->d_addr)))) {
>
>                                 if (hdr->ether_type == ether_type_slow_be)
> {
>                                         bond_mode_8023ad_handle_slow_pkt(
> -                                           internals, slaves[idx],
> bufs[j]);
> +                                           internals, slaves[idx],
> +                                           mbuf_bounce[j]);
>                                 } else
> -                                       rte_pktmbuf_free(bufs[j]);
> +                                       rte_pktmbuf_free(mbuf_bounce[j]);
>
>                                 /* Packet is managed by mode 4 or dropped,
> shift the array */
> -                               num_rx_total--;
> -                               if (j < num_rx_total) {
> -                                       memmove(&bufs[j], &bufs[j + 1],
> sizeof(bufs[0]) *
> -                                               (num_rx_total - j));
> +                               num_rx_slave--;
> +                               if (j < num_rx_slave) {
> +                                       memmove(&mbuf_bounce[j],
> +                                               &mbuf_bounce[j + 1],
> +                                               sizeof(mbuf_bounce[0]) *
> +                                               (num_rx_slave - j));
>                                 }
> -                       } else
> +                       } else {
>                                 j++;
> +                       }
>                 }
> +
> +               if (num_rx_slave > 0) {
> +                       if (mbuf_bounce[0] == NULL)
> +                               RTE_LOG(ERR, PMD, "%s: Enqueue a NULL??\n",
> +                                       __func__);
> +
> +                       num_enq_slave =
> rte_ring_enqueue_burst(internals->rx_ring,
> +                                                              (void
> **)mbuf_bounce,
> +
> num_rx_slave,
> +                                                              NULL);
> +
> +                       if (num_enq_slave < num_rx_slave) {
> +                               RTE_LOG(ERR, PMD,
> +                                       "%s: failed to enqueue %u packets",
> +                                       __func__,
> +                                       (num_rx_slave - num_enq_slave));
> +                               for (j = num_enq_slave; j < num_rx_slave;
> j++)
> +                                       rte_pktmbuf_free(mbuf_bounce[j]);
> +                       }
> +                       num_rx_total += num_enq_slave;
> +               }
> +
>                 if (unlikely(++idx == slave_count))
>                         idx = 0;
>         }
>
>         internals->active_slave = idx;
> -       return num_rx_total;
> +dequeue:
> +       return rte_ring_dequeue_burst(internals->rx_ring, (void **)bufs,
> +                                     nb_pkts, NULL);
>  }
>
>  #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) ||
> defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
> @@ -3065,6 +3109,7 @@ bond_alloc(struct rte_vdev_device *dev, uint8_t mode)
>         struct bond_dev_private *internals = NULL;
>         struct rte_eth_dev *eth_dev = NULL;
>         uint32_t vlan_filter_bmp_size;
> +       char mem_name[RTE_ETH_NAME_MAX_LEN];
>
>         /* now do all data allocation - for eth_dev structure, dummy pci
> driver
>          * and internal (private) data
> @@ -3129,6 +3174,19 @@ bond_alloc(struct rte_vdev_device *dev, uint8_t
> mode)
>         TAILQ_INIT(&internals->flow_list);
>         internals->flow_isolated_valid = 0;
>
> +       snprintf(mem_name, RTE_DIM(mem_name), "bond_%u_rx",
> internals->port_id);
> +       internals->rx_ring = rte_ring_lookup(mem_name);
> +       if (internals->rx_ring == NULL) {
> +               internals->rx_ring = rte_ring_create(mem_name,
> +
> rte_align32pow2(PMD_BOND_RECV_RING_PKTS *
> +                                                    rte_lcore_count()),
> +                                    socket_id, 0);
> +               if (internals->rx_ring == NULL)
> +                       rte_panic("%s: Failed to create rx ring '%s':
> %s\n", name,
> +                                 mem_name, rte_strerror(rte_errno));
> +       }
> +
> +
>         /* Set mode 4 default configuration */
>         bond_mode_8023ad_setup(eth_dev, NULL);
>         if (bond_ethdev_mode_set(eth_dev, mode)) {
> diff --git a/drivers/net/bonding/rte_eth_bond_private.h
> b/drivers/net/bonding/rte_eth_bond_private.h
> index 43e0e448df..80261c6b14 100644
> --- a/drivers/net/bonding/rte_eth_bond_private.h
> +++ b/drivers/net/bonding/rte_eth_bond_private.h
> @@ -26,6 +26,8 @@
>  #define PMD_BOND_LSC_POLL_PERIOD_KVARG         ("lsc_poll_period_ms")
>  #define PMD_BOND_LINK_UP_PROP_DELAY_KVARG      ("up_delay")
>  #define PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG    ("down_delay")
> +#define PMD_BOND_RECV_RING_PKTS                512
> +#define PMD_BOND_RECV_PKTS_PER_SLAVE           32
>
>  #define PMD_BOND_XMIT_POLICY_LAYER2_KVARG      ("l2")
>  #define PMD_BOND_XMIT_POLICY_LAYER23_KVARG     ("l23")
> @@ -175,6 +177,8 @@ struct bond_dev_private {
>
>         void *vlan_filter_bmpmem;               /* enabled vlan filter
> bitmap */
>         struct rte_bitmap *vlan_filter_bmp;
> +
> +       struct rte_ring *rx_ring;
>  };
>
>  extern const struct eth_dev_ops default_dev_ops;
> --
> 2.18.0
>
>
  
Matan Azrad Aug. 21, 2018, 10:56 a.m. UTC | #2
Hi

From: Chas Williams
> This will need to be implemented for some of the other RX burst methods at
> some point for other modes to see this performance improvement (with the
> exception of active-backup).

Yes, I think it should be done at least to bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for now.

> On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <bluca@debian.org> wrote:
> 
> > During bond 802.3ad receive, a burst of packets is fetched from each
> > slave into a local array and appended to per-slave ring buffer.
> > Packets are taken from the head of the ring buffer and returned to the
> > caller.  The number of mbufs provided to each slave is sufficient to
> > meet the requirements of the ixgbe vector receive.

Luca,

Can you explain these requirements of ixgbe?
Did you check for other vendor PMDs? It may hurt performance there..

> >
> > This small change improves performances of the bonding driver
> > considerably. Vyatta has been using it for years in production.
> >
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Eric Kinzie <ekinzie@brocade.com>
> > Signed-off-by: Luca Boccassi <bluca@debian.org>
> >
> 
> Acked-by: Chas Williams <chas3@att.com>
> 
> 
> 
> > ---
> > v2 and v3: fix checkpatch warnings
> > v4: add Eric's original signed-off-by from the Vyatta internal repo
> >
> >  drivers/net/bonding/rte_eth_bond_api.c     | 13 +++
> >  drivers/net/bonding/rte_eth_bond_pmd.c     | 98 +++++++++++++++++--
> ---
> >  drivers/net/bonding/rte_eth_bond_private.h |  4 +
> >  3 files changed, 95 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/net/bonding/rte_eth_bond_api.c
> > b/drivers/net/bonding/rte_eth_bond_api.c
> > index 8bc04cfd11..3d22203e91 100644
> > --- a/drivers/net/bonding/rte_eth_bond_api.c
> > +++ b/drivers/net/bonding/rte_eth_bond_api.c
> > @@ -524,6 +524,10 @@ __eth_bond_slave_remove_lock_free(uint16_t
> > bonded_port_id,
> >
> > sizeof(*(rte_eth_devices[bonded_port_id].data->mac_addrs)));
> >         }
> >         if (internals->slave_count == 0) {
> > +               /* Remove any remaining packets in the receive ring */
> > +               struct rte_mbuf *bufs[PMD_BOND_RECV_PKTS_PER_SLAVE];
> > +               unsigned int j, count;
> > +
> >                 internals->rx_offload_capa = 0;
> >                 internals->tx_offload_capa = 0;
> >                 internals->rx_queue_offload_capa = 0; @@ -532,6
> > +536,15 @@ __eth_bond_slave_remove_lock_free(uint16_t
> > bonded_port_id,
> >                 internals->reta_size = 0;
> >                 internals->candidate_max_rx_pktlen = 0;
> >                 internals->max_rx_pktlen = 0;
> > +
> > +               do {
> > +                       count = rte_ring_dequeue_burst(internals->rx_ring,
> > +                                       (void **)bufs,
> > +                                       PMD_BOND_RECV_PKTS_PER_SLAVE,
> > +                                       NULL);
> > +                       for  (j = 0; j < count; j++)
> > +                               rte_pktmbuf_free(bufs[j]);
> > +               } while (count > 0);
> >         }
> >         return 0;
> >  }
> > diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c
> > b/drivers/net/bonding/rte_eth_bond_pmd.c
> > index 58f7377c60..ae756c4e3a 100644
> > --- a/drivers/net/bonding/rte_eth_bond_pmd.c
> > +++ b/drivers/net/bonding/rte_eth_bond_pmd.c
> > @@ -18,6 +18,8 @@
> >  #include <rte_alarm.h>
> >  #include <rte_cycles.h>
> >  #include <rte_string_fns.h>
> > +#include <rte_errno.h>
> > +#include <rte_lcore.h>
> >
> >  #include "rte_eth_bond.h"
> >  #include "rte_eth_bond_private.h"
> > @@ -402,10 +404,15 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> struct
> > rte_mbuf **bufs,
> >         struct bond_dev_private *internals = bd_rx_q->dev_private;
> >         struct ether_addr bond_mac;
> >
> > +       unsigned int rx_ring_avail =
> > rte_ring_free_count(internals->rx_ring);
> > +       struct rte_mbuf
> *mbuf_bounce[PMD_BOND_RECV_PKTS_PER_SLAVE];
> > +
> >         struct ether_hdr *hdr;
> >
> >         const uint16_t ether_type_slow_be =
> > rte_be_to_cpu_16(ETHER_TYPE_SLOW);
> >         uint16_t num_rx_total = 0;      /* Total number of received
> > packets */
> > +       uint16_t num_rx_slave;
> > +       uint16_t num_enq_slave;
> >         uint16_t slaves[RTE_MAX_ETHPORTS];
> >         uint16_t slave_count, idx;
> >
> > @@ -414,6 +421,9 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> struct
> > rte_mbuf **bufs,
> >         uint8_t i, j, k;
> >         uint8_t subtype;
> >
> > +       if (rx_ring_avail < PMD_BOND_RECV_PKTS_PER_SLAVE)
> > +               goto dequeue;
> > +
> >         rte_eth_macaddr_get(internals->port_id, &bond_mac);
> >         /* Copy slave list to protect against slave up/down changes
> > during tx
> >          * bursting */
> > @@ -426,62 +436,96 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> struct
> > rte_mbuf **bufs,
> >                 internals->active_slave = 0;
> >                 idx = 0;
> >         }
> > -       for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
> > -               j = num_rx_total;
> > +       for (i = 0; i < slave_count && num_rx_total < rx_ring_avail; i++) {
> > +               j = 0;
> >                 collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
> >                                          COLLECTING);
> >
> >                 /* Read packets from this slave */
> > -               num_rx_total += rte_eth_rx_burst(slaves[idx],
> > bd_rx_q->queue_id,
> > -                               &bufs[num_rx_total], nb_pkts -
> > num_rx_total);
> > +               if (unlikely(rx_ring_avail - num_rx_total <
> > +                               PMD_BOND_RECV_PKTS_PER_SLAVE))
> > +                       continue;
> > +               num_rx_slave = rte_eth_rx_burst(slaves[idx],
> > bd_rx_q->queue_id,
> > +                               mbuf_bounce,
> > + PMD_BOND_RECV_PKTS_PER_SLAVE);
> >
> > -               for (k = j; k < 2 && k < num_rx_total; k++)
> > -                       rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));
> > +               for (k = j; k < 2 && k < num_rx_slave; k++)
> > +                       rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[k],
> > void *));
> >
> >                 /* Handle slow protocol packets. */
> > -               while (j < num_rx_total) {
> > +               while (j < num_rx_slave) {
> >
> >                         /* If packet is not pure L2 and is known, skip
> > it */
> > -                       if ((bufs[j]->packet_type & ~RTE_PTYPE_L2_ETHER)
> > != 0) {
> > +                       if ((mbuf_bounce[j]->packet_type &
> > ~RTE_PTYPE_L2_ETHER) != 0) {
> >                                 j++;
> >                                 continue;
> >                         }
> >
> > -                       if (j + 3 < num_rx_total)
> > -                               rte_prefetch0(rte_pktmbuf_mtod(bufs[j +
> > 3], void *));
> > +                       if (j + 3 < num_rx_slave)
> > +
> >  rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[j + 3],
> > +                                                              void
> > + *));
> >
> > -                       hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr
> > *);
> > +                       hdr = rte_pktmbuf_mtod(mbuf_bounce[j],
> > +                                              struct ether_hdr *);
> >                         subtype = ((struct slow_protocol_frame
> > *)hdr)->slow_protocol.subtype;
> >
> >                         /* Remove packet from array if it is slow
> > packet or slave is not
> >                          * in collecting state or bonding interface is
> > not in promiscuous
> >                          * mode and packet address does not match. */
> > -                       if (unlikely(is_lacp_packets(hdr->ether_type,
> > subtype, bufs[j]) ||
> > +                       if (unlikely(is_lacp_packets(hdr->ether_type,
> > +                                                    subtype,
> > mbuf_bounce[j]) ||
> >                                 !collecting || (!promisc &&
> >
> > !is_multicast_ether_addr(&hdr->d_addr) &&
> >                                         !is_same_ether_addr(&bond_mac,
> > &hdr->d_addr)))) {
> >
> >                                 if (hdr->ether_type ==
> > ether_type_slow_be) {
> >                                         bond_mode_8023ad_handle_slow_pkt(
> > -                                           internals, slaves[idx],
> > bufs[j]);
> > +                                           internals, slaves[idx],
> > +                                           mbuf_bounce[j]);
> >                                 } else
> > -                                       rte_pktmbuf_free(bufs[j]);
> > +
> > + rte_pktmbuf_free(mbuf_bounce[j]);
> >
> >                                 /* Packet is managed by mode 4 or
> > dropped, shift the array */
> > -                               num_rx_total--;
> > -                               if (j < num_rx_total) {
> > -                                       memmove(&bufs[j], &bufs[j + 1],
> > sizeof(bufs[0]) *
> > -                                               (num_rx_total - j));
> > +                               num_rx_slave--;
> > +                               if (j < num_rx_slave) {
> > +                                       memmove(&mbuf_bounce[j],
> > +                                               &mbuf_bounce[j + 1],
> > +                                               sizeof(mbuf_bounce[0]) *
> > +                                               (num_rx_slave - j));
> >                                 }
> > -                       } else
> > +                       } else {
> >                                 j++;
> > +                       }
> >                 }
> > +
> > +               if (num_rx_slave > 0) {
> > +                       if (mbuf_bounce[0] == NULL)
> > +                               RTE_LOG(ERR, PMD, "%s: Enqueue a NULL??\n",
> > +                                       __func__);
> > +
> > +                       num_enq_slave =
> > rte_ring_enqueue_burst(internals->rx_ring,
> > +                                                              (void
> > **)mbuf_bounce,
> > +
> > num_rx_slave,
> > +                                                              NULL);
> > +
> > +                       if (num_enq_slave < num_rx_slave) {
> > +                               RTE_LOG(ERR, PMD,
> > +                                       "%s: failed to enqueue %u packets",
> > +                                       __func__,
> > +                                       (num_rx_slave - num_enq_slave));
> > +                               for (j = num_enq_slave; j <
> > + num_rx_slave;
> > j++)
> > +                                       rte_pktmbuf_free(mbuf_bounce[j]);
> > +                       }
> > +                       num_rx_total += num_enq_slave;
> > +               }
> > +
> >                 if (unlikely(++idx == slave_count))
> >                         idx = 0;
> >         }
> >
> >         internals->active_slave = idx;
> > -       return num_rx_total;
> > +dequeue:
> > +       return rte_ring_dequeue_burst(internals->rx_ring, (void **)bufs,
> > +                                     nb_pkts, NULL);
> >  }
> >
> >  #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) ||
> > defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
> > @@ -3065,6 +3109,7 @@ bond_alloc(struct rte_vdev_device *dev, uint8_t
> mode)
> >         struct bond_dev_private *internals = NULL;
> >         struct rte_eth_dev *eth_dev = NULL;
> >         uint32_t vlan_filter_bmp_size;
> > +       char mem_name[RTE_ETH_NAME_MAX_LEN];
> >
> >         /* now do all data allocation - for eth_dev structure, dummy
> > pci driver
> >          * and internal (private) data @@ -3129,6 +3174,19 @@
> > bond_alloc(struct rte_vdev_device *dev, uint8_t
> > mode)
> >         TAILQ_INIT(&internals->flow_list);
> >         internals->flow_isolated_valid = 0;
> >
> > +       snprintf(mem_name, RTE_DIM(mem_name), "bond_%u_rx",
> > internals->port_id);
> > +       internals->rx_ring = rte_ring_lookup(mem_name);
> > +       if (internals->rx_ring == NULL) {
> > +               internals->rx_ring = rte_ring_create(mem_name,
> > +
> > rte_align32pow2(PMD_BOND_RECV_RING_PKTS *
> > +                                                    rte_lcore_count()),
> > +                                    socket_id, 0);
> > +               if (internals->rx_ring == NULL)
> > +                       rte_panic("%s: Failed to create rx ring '%s':
> > %s\n", name,
> > +                                 mem_name, rte_strerror(rte_errno));
> > +       }
> > +
> > +
> >         /* Set mode 4 default configuration */
> >         bond_mode_8023ad_setup(eth_dev, NULL);
> >         if (bond_ethdev_mode_set(eth_dev, mode)) { diff --git
> > a/drivers/net/bonding/rte_eth_bond_private.h
> > b/drivers/net/bonding/rte_eth_bond_private.h
> > index 43e0e448df..80261c6b14 100644
> > --- a/drivers/net/bonding/rte_eth_bond_private.h
> > +++ b/drivers/net/bonding/rte_eth_bond_private.h
> > @@ -26,6 +26,8 @@
> >  #define PMD_BOND_LSC_POLL_PERIOD_KVARG
> ("lsc_poll_period_ms")
> >  #define PMD_BOND_LINK_UP_PROP_DELAY_KVARG      ("up_delay")
> >  #define PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG
> ("down_delay")
> > +#define PMD_BOND_RECV_RING_PKTS                512
> > +#define PMD_BOND_RECV_PKTS_PER_SLAVE           32
> >
> >  #define PMD_BOND_XMIT_POLICY_LAYER2_KVARG      ("l2")
> >  #define PMD_BOND_XMIT_POLICY_LAYER23_KVARG     ("l23")
> > @@ -175,6 +177,8 @@ struct bond_dev_private {
> >
> >         void *vlan_filter_bmpmem;               /* enabled vlan filter
> > bitmap */
> >         struct rte_bitmap *vlan_filter_bmp;
> > +
> > +       struct rte_ring *rx_ring;
> >  };
> >
> >  extern const struct eth_dev_ops default_dev_ops;
> > --
> > 2.18.0
> >
> >
  
Luca Boccassi Aug. 21, 2018, 11:13 a.m. UTC | #3
On Tue, 2018-08-21 at 10:56 +0000, Matan Azrad wrote:
> Hi
> 
> From: Chas Williams
> > This will need to be implemented for some of the other RX burst
> > methods at
> > some point for other modes to see this performance improvement
> > (with the
> > exception of active-backup).
> 
> Yes, I think it should be done at least to
> bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for now.
> 
> > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <bluca@debian.org>
> > wrote:
> > 
> > > During bond 802.3ad receive, a burst of packets is fetched from
> > > each
> > > slave into a local array and appended to per-slave ring buffer.
> > > Packets are taken from the head of the ring buffer and returned
> > > to the
> > > caller.  The number of mbufs provided to each slave is sufficient
> > > to
> > > meet the requirements of the ixgbe vector receive.
> 
> Luca,
> 
> Can you explain these requirements of ixgbe?

I think Chas knows this better than I do. Chas?

> Did you check for other vendor PMDs? It may hurt performance there..

Yeah we do support the appliance with more drivers - off the top of my
head at the very least e1000, i40e, bnxt, and I'm pretty sure there are
users that run with bonded virtio interfaces (yeah I know). No issues
were reported as far as I'm aware.
  
Chas Williams Aug. 21, 2018, 2:58 p.m. UTC | #4
On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad <matan@mellanox.com> wrote:

> Hi
>
> From: Chas Williams
> > This will need to be implemented for some of the other RX burst methods
> at
> > some point for other modes to see this performance improvement (with the
> > exception of active-backup).
>
> Yes, I think it should be done at least to
> bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for now.
>

There is some duplicated code between the various RX paths.
I would like to eliminate that as much as possible, so I was
going to give that some thought first.


>
> > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <bluca@debian.org> wrote:
> >
> > > During bond 802.3ad receive, a burst of packets is fetched from each
> > > slave into a local array and appended to per-slave ring buffer.
> > > Packets are taken from the head of the ring buffer and returned to the
> > > caller.  The number of mbufs provided to each slave is sufficient to
> > > meet the requirements of the ixgbe vector receive.
>
> Luca,
>
> Can you explain these requirements of ixgbe?
>

The ixgbe (and some other Intel PMDs) have vectorized RX
routines that are more efficient (if not faster) taking
advantage of some advanced CPU instructions.  I think
you need to be receiving at least 32 packets or more.



> Did you check for other vendor PMDs? It may hurt performance there..
>

I don't know, but I suspect probably not.  For the most
part you are typically reading almost up to the vector
requirement.  But if one slave has just a single packet,
then you can't vectorize on the next slave.

Empirically, this patch has been around of years in one
form or another.  No complaints other than "bonding is
slow" which is why it was written.


> > >
> > > This small change improves performances of the bonding driver
> > > considerably. Vyatta has been using it for years in production.
> > >
> > > Cc: stable@dpdk.org
> > >
> > > Signed-off-by: Eric Kinzie <ekinzie@brocade.com>
> > > Signed-off-by: Luca Boccassi <bluca@debian.org>
> > >
> >
> > Acked-by: Chas Williams <chas3@att.com>
> >
> >
> >
> > > ---
> > > v2 and v3: fix checkpatch warnings
> > > v4: add Eric's original signed-off-by from the Vyatta internal repo
> > >
> > >  drivers/net/bonding/rte_eth_bond_api.c     | 13 +++
> > >  drivers/net/bonding/rte_eth_bond_pmd.c     | 98 +++++++++++++++++--
> > ---
> > >  drivers/net/bonding/rte_eth_bond_private.h |  4 +
> > >  3 files changed, 95 insertions(+), 20 deletions(-)
> > >
> > > diff --git a/drivers/net/bonding/rte_eth_bond_api.c
> > > b/drivers/net/bonding/rte_eth_bond_api.c
> > > index 8bc04cfd11..3d22203e91 100644
> > > --- a/drivers/net/bonding/rte_eth_bond_api.c
> > > +++ b/drivers/net/bonding/rte_eth_bond_api.c
> > > @@ -524,6 +524,10 @@ __eth_bond_slave_remove_lock_free(uint16_t
> > > bonded_port_id,
> > >
> > > sizeof(*(rte_eth_devices[bonded_port_id].data->mac_addrs)));
> > >         }
> > >         if (internals->slave_count == 0) {
> > > +               /* Remove any remaining packets in the receive ring */
> > > +               struct rte_mbuf *bufs[PMD_BOND_RECV_PKTS_PER_SLAVE];
> > > +               unsigned int j, count;
> > > +
> > >                 internals->rx_offload_capa = 0;
> > >                 internals->tx_offload_capa = 0;
> > >                 internals->rx_queue_offload_capa = 0; @@ -532,6
> > > +536,15 @@ __eth_bond_slave_remove_lock_free(uint16_t
> > > bonded_port_id,
> > >                 internals->reta_size = 0;
> > >                 internals->candidate_max_rx_pktlen = 0;
> > >                 internals->max_rx_pktlen = 0;
> > > +
> > > +               do {
> > > +                       count =
> rte_ring_dequeue_burst(internals->rx_ring,
> > > +                                       (void **)bufs,
> > > +                                       PMD_BOND_RECV_PKTS_PER_SLAVE,
> > > +                                       NULL);
> > > +                       for  (j = 0; j < count; j++)
> > > +                               rte_pktmbuf_free(bufs[j]);
> > > +               } while (count > 0);
> > >         }
> > >         return 0;
> > >  }
> > > diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c
> > > b/drivers/net/bonding/rte_eth_bond_pmd.c
> > > index 58f7377c60..ae756c4e3a 100644
> > > --- a/drivers/net/bonding/rte_eth_bond_pmd.c
> > > +++ b/drivers/net/bonding/rte_eth_bond_pmd.c
> > > @@ -18,6 +18,8 @@
> > >  #include <rte_alarm.h>
> > >  #include <rte_cycles.h>
> > >  #include <rte_string_fns.h>
> > > +#include <rte_errno.h>
> > > +#include <rte_lcore.h>
> > >
> > >  #include "rte_eth_bond.h"
> > >  #include "rte_eth_bond_private.h"
> > > @@ -402,10 +404,15 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> > struct
> > > rte_mbuf **bufs,
> > >         struct bond_dev_private *internals = bd_rx_q->dev_private;
> > >         struct ether_addr bond_mac;
> > >
> > > +       unsigned int rx_ring_avail =
> > > rte_ring_free_count(internals->rx_ring);
> > > +       struct rte_mbuf
> > *mbuf_bounce[PMD_BOND_RECV_PKTS_PER_SLAVE];
> > > +
> > >         struct ether_hdr *hdr;
> > >
> > >         const uint16_t ether_type_slow_be =
> > > rte_be_to_cpu_16(ETHER_TYPE_SLOW);
> > >         uint16_t num_rx_total = 0;      /* Total number of received
> > > packets */
> > > +       uint16_t num_rx_slave;
> > > +       uint16_t num_enq_slave;
> > >         uint16_t slaves[RTE_MAX_ETHPORTS];
> > >         uint16_t slave_count, idx;
> > >
> > > @@ -414,6 +421,9 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> > struct
> > > rte_mbuf **bufs,
> > >         uint8_t i, j, k;
> > >         uint8_t subtype;
> > >
> > > +       if (rx_ring_avail < PMD_BOND_RECV_PKTS_PER_SLAVE)
> > > +               goto dequeue;
> > > +
> > >         rte_eth_macaddr_get(internals->port_id, &bond_mac);
> > >         /* Copy slave list to protect against slave up/down changes
> > > during tx
> > >          * bursting */
> > > @@ -426,62 +436,96 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> > struct
> > > rte_mbuf **bufs,
> > >                 internals->active_slave = 0;
> > >                 idx = 0;
> > >         }
> > > -       for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
> > > -               j = num_rx_total;
> > > +       for (i = 0; i < slave_count && num_rx_total < rx_ring_avail;
> i++) {
> > > +               j = 0;
> > >                 collecting =
> ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
> > >                                          COLLECTING);
> > >
> > >                 /* Read packets from this slave */
> > > -               num_rx_total += rte_eth_rx_burst(slaves[idx],
> > > bd_rx_q->queue_id,
> > > -                               &bufs[num_rx_total], nb_pkts -
> > > num_rx_total);
> > > +               if (unlikely(rx_ring_avail - num_rx_total <
> > > +                               PMD_BOND_RECV_PKTS_PER_SLAVE))
> > > +                       continue;
> > > +               num_rx_slave = rte_eth_rx_burst(slaves[idx],
> > > bd_rx_q->queue_id,
> > > +                               mbuf_bounce,
> > > + PMD_BOND_RECV_PKTS_PER_SLAVE);
> > >
> > > -               for (k = j; k < 2 && k < num_rx_total; k++)
> > > -                       rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void
> *));
> > > +               for (k = j; k < 2 && k < num_rx_slave; k++)
> > > +                       rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[k],
> > > void *));
> > >
> > >                 /* Handle slow protocol packets. */
> > > -               while (j < num_rx_total) {
> > > +               while (j < num_rx_slave) {
> > >
> > >                         /* If packet is not pure L2 and is known, skip
> > > it */
> > > -                       if ((bufs[j]->packet_type &
> ~RTE_PTYPE_L2_ETHER)
> > > != 0) {
> > > +                       if ((mbuf_bounce[j]->packet_type &
> > > ~RTE_PTYPE_L2_ETHER) != 0) {
> > >                                 j++;
> > >                                 continue;
> > >                         }
> > >
> > > -                       if (j + 3 < num_rx_total)
> > > -                               rte_prefetch0(rte_pktmbuf_mtod(bufs[j +
> > > 3], void *));
> > > +                       if (j + 3 < num_rx_slave)
> > > +
> > >  rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[j + 3],
> > > +                                                              void
> > > + *));
> > >
> > > -                       hdr = rte_pktmbuf_mtod(bufs[j], struct
> ether_hdr
> > > *);
> > > +                       hdr = rte_pktmbuf_mtod(mbuf_bounce[j],
> > > +                                              struct ether_hdr *);
> > >                         subtype = ((struct slow_protocol_frame
> > > *)hdr)->slow_protocol.subtype;
> > >
> > >                         /* Remove packet from array if it is slow
> > > packet or slave is not
> > >                          * in collecting state or bonding interface is
> > > not in promiscuous
> > >                          * mode and packet address does not match. */
> > > -                       if (unlikely(is_lacp_packets(hdr->ether_type,
> > > subtype, bufs[j]) ||
> > > +                       if (unlikely(is_lacp_packets(hdr->ether_type,
> > > +                                                    subtype,
> > > mbuf_bounce[j]) ||
> > >                                 !collecting || (!promisc &&
> > >
> > > !is_multicast_ether_addr(&hdr->d_addr) &&
> > >                                         !is_same_ether_addr(&bond_mac,
> > > &hdr->d_addr)))) {
> > >
> > >                                 if (hdr->ether_type ==
> > > ether_type_slow_be) {
> > >
>  bond_mode_8023ad_handle_slow_pkt(
> > > -                                           internals, slaves[idx],
> > > bufs[j]);
> > > +                                           internals, slaves[idx],
> > > +                                           mbuf_bounce[j]);
> > >                                 } else
> > > -                                       rte_pktmbuf_free(bufs[j]);
> > > +
> > > + rte_pktmbuf_free(mbuf_bounce[j]);
> > >
> > >                                 /* Packet is managed by mode 4 or
> > > dropped, shift the array */
> > > -                               num_rx_total--;
> > > -                               if (j < num_rx_total) {
> > > -                                       memmove(&bufs[j], &bufs[j + 1],
> > > sizeof(bufs[0]) *
> > > -                                               (num_rx_total - j));
> > > +                               num_rx_slave--;
> > > +                               if (j < num_rx_slave) {
> > > +                                       memmove(&mbuf_bounce[j],
> > > +                                               &mbuf_bounce[j + 1],
> > > +                                               sizeof(mbuf_bounce[0])
> *
> > > +                                               (num_rx_slave - j));
> > >                                 }
> > > -                       } else
> > > +                       } else {
> > >                                 j++;
> > > +                       }
> > >                 }
> > > +
> > > +               if (num_rx_slave > 0) {
> > > +                       if (mbuf_bounce[0] == NULL)
> > > +                               RTE_LOG(ERR, PMD, "%s: Enqueue a
> NULL??\n",
> > > +                                       __func__);
> > > +
> > > +                       num_enq_slave =
> > > rte_ring_enqueue_burst(internals->rx_ring,
> > > +                                                              (void
> > > **)mbuf_bounce,
> > > +
> > > num_rx_slave,
> > > +                                                              NULL);
> > > +
> > > +                       if (num_enq_slave < num_rx_slave) {
> > > +                               RTE_LOG(ERR, PMD,
> > > +                                       "%s: failed to enqueue %u
> packets",
> > > +                                       __func__,
> > > +                                       (num_rx_slave -
> num_enq_slave));
> > > +                               for (j = num_enq_slave; j <
> > > + num_rx_slave;
> > > j++)
> > > +
>  rte_pktmbuf_free(mbuf_bounce[j]);
> > > +                       }
> > > +                       num_rx_total += num_enq_slave;
> > > +               }
> > > +
> > >                 if (unlikely(++idx == slave_count))
> > >                         idx = 0;
> > >         }
> > >
> > >         internals->active_slave = idx;
> > > -       return num_rx_total;
> > > +dequeue:
> > > +       return rte_ring_dequeue_burst(internals->rx_ring, (void
> **)bufs,
> > > +                                     nb_pkts, NULL);
> > >  }
> > >
> > >  #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) ||
> > > defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
> > > @@ -3065,6 +3109,7 @@ bond_alloc(struct rte_vdev_device *dev, uint8_t
> > mode)
> > >         struct bond_dev_private *internals = NULL;
> > >         struct rte_eth_dev *eth_dev = NULL;
> > >         uint32_t vlan_filter_bmp_size;
> > > +       char mem_name[RTE_ETH_NAME_MAX_LEN];
> > >
> > >         /* now do all data allocation - for eth_dev structure, dummy
> > > pci driver
> > >          * and internal (private) data @@ -3129,6 +3174,19 @@
> > > bond_alloc(struct rte_vdev_device *dev, uint8_t
> > > mode)
> > >         TAILQ_INIT(&internals->flow_list);
> > >         internals->flow_isolated_valid = 0;
> > >
> > > +       snprintf(mem_name, RTE_DIM(mem_name), "bond_%u_rx",
> > > internals->port_id);
> > > +       internals->rx_ring = rte_ring_lookup(mem_name);
> > > +       if (internals->rx_ring == NULL) {
> > > +               internals->rx_ring = rte_ring_create(mem_name,
> > > +
> > > rte_align32pow2(PMD_BOND_RECV_RING_PKTS *
> > > +
> rte_lcore_count()),
> > > +                                    socket_id, 0);
> > > +               if (internals->rx_ring == NULL)
> > > +                       rte_panic("%s: Failed to create rx ring '%s':
> > > %s\n", name,
> > > +                                 mem_name, rte_strerror(rte_errno));
> > > +       }
> > > +
> > > +
> > >         /* Set mode 4 default configuration */
> > >         bond_mode_8023ad_setup(eth_dev, NULL);
> > >         if (bond_ethdev_mode_set(eth_dev, mode)) { diff --git
> > > a/drivers/net/bonding/rte_eth_bond_private.h
> > > b/drivers/net/bonding/rte_eth_bond_private.h
> > > index 43e0e448df..80261c6b14 100644
> > > --- a/drivers/net/bonding/rte_eth_bond_private.h
> > > +++ b/drivers/net/bonding/rte_eth_bond_private.h
> > > @@ -26,6 +26,8 @@
> > >  #define PMD_BOND_LSC_POLL_PERIOD_KVARG
> > ("lsc_poll_period_ms")
> > >  #define PMD_BOND_LINK_UP_PROP_DELAY_KVARG      ("up_delay")
> > >  #define PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG
> > ("down_delay")
> > > +#define PMD_BOND_RECV_RING_PKTS                512
> > > +#define PMD_BOND_RECV_PKTS_PER_SLAVE           32
> > >
> > >  #define PMD_BOND_XMIT_POLICY_LAYER2_KVARG      ("l2")
> > >  #define PMD_BOND_XMIT_POLICY_LAYER23_KVARG     ("l23")
> > > @@ -175,6 +177,8 @@ struct bond_dev_private {
> > >
> > >         void *vlan_filter_bmpmem;               /* enabled vlan filter
> > > bitmap */
> > >         struct rte_bitmap *vlan_filter_bmp;
> > > +
> > > +       struct rte_ring *rx_ring;
> > >  };
> > >
> > >  extern const struct eth_dev_ops default_dev_ops;
> > > --
> > > 2.18.0
> > >
> > >
>
  
Matan Azrad Aug. 21, 2018, 3:43 p.m. UTC | #5
Hi Chas

From: Chas Williams
> On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad <matan@mellanox.com>
> wrote:
> Hi
> 
> From: Chas Williams
> > This will need to be implemented for some of the other RX burst
> > methods at some point for other modes to see this performance
> > improvement (with the exception of active-backup).
> 
> Yes, I think it should be done at least to
> bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for now.
> 
> There is some duplicated code between the various RX paths.
> I would like to eliminate that as much as possible, so I was going to give that
> some thought first.

There is no reason to stay this function as is while its twin is changed.

> 
> 
> > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <bluca@debian.org> wrote:
> >
> > > During bond 802.3ad receive, a burst of packets is fetched from each
> > > slave into a local array and appended to per-slave ring buffer.
> > > Packets are taken from the head of the ring buffer and returned to
> > > the caller.  The number of mbufs provided to each slave is
> > > sufficient to meet the requirements of the ixgbe vector receive.
> 
> Luca,
> 
> Can you explain these requirements of ixgbe?
> 
> The ixgbe (and some other Intel PMDs) have vectorized RX routines that are
> more efficient (if not faster) taking advantage of some advanced CPU
> instructions.  I think you need to be receiving at least 32 packets or more.

So, why to do it in bond which is a generic driver for all the vendors PMDs,
If for ixgbe and other Intel nics it is better you can force those PMDs to receive always 32 packets
and to manage a ring by themselves.
 
> Did you check for other vendor PMDs? It may hurt performance there..
> 
> I don't know, but I suspect probably not.  For the most part you are typically
> reading almost up to the vector requirement.  But if one slave has just a
> single packet, then you can't vectorize on the next slave.
> 

I don't think that the ring overhead is better for PMDs which are not using the vectorized instructions.

> Empirically, this patch has been around of years in one form or another.  No
> complaints other than "bonding is slow" which is why it was written.
> 
> 
> > >
> > > This small change improves performances of the bonding driver
> > > considerably. Vyatta has been using it for years in production.
> > >
> > > Cc: stable@dpdk.org
> > >
> > > Signed-off-by: Eric Kinzie <ekinzie@brocade.com>
> > > Signed-off-by: Luca Boccassi <bluca@debian.org>
> > >
> >
> > Acked-by: Chas Williams <chas3@att.com>
> >
> >
> >
> > > ---
> > > v2 and v3: fix checkpatch warnings
> > > v4: add Eric's original signed-off-by from the Vyatta internal repo
> > >
> > >  drivers/net/bonding/rte_eth_bond_api.c     | 13 +++
> > >  drivers/net/bonding/rte_eth_bond_pmd.c     | 98
> +++++++++++++++++--
> > ---
> > >  drivers/net/bonding/rte_eth_bond_private.h |  4 +
> > >  3 files changed, 95 insertions(+), 20 deletions(-)
> > >
> > > diff --git a/drivers/net/bonding/rte_eth_bond_api.c
> > > b/drivers/net/bonding/rte_eth_bond_api.c
> > > index 8bc04cfd11..3d22203e91 100644
> > > --- a/drivers/net/bonding/rte_eth_bond_api.c
> > > +++ b/drivers/net/bonding/rte_eth_bond_api.c
> > > @@ -524,6 +524,10 @@ __eth_bond_slave_remove_lock_free(uint16_t
> > > bonded_port_id,
> > >
> > > sizeof(*(rte_eth_devices[bonded_port_id].data->mac_addrs)));
> > >         }
> > >         if (internals->slave_count == 0) {
> > > +               /* Remove any remaining packets in the receive ring */
> > > +               struct rte_mbuf *bufs[PMD_BOND_RECV_PKTS_PER_SLAVE];
> > > +               unsigned int j, count;
> > > +
> > >                 internals->rx_offload_capa = 0;
> > >                 internals->tx_offload_capa = 0;
> > >                 internals->rx_queue_offload_capa = 0; @@ -532,6
> > > +536,15 @@ __eth_bond_slave_remove_lock_free(uint16_t
> > > bonded_port_id,
> > >                 internals->reta_size = 0;
> > >                 internals->candidate_max_rx_pktlen = 0;
> > >                 internals->max_rx_pktlen = 0;
> > > +
> > > +               do {
> > > +                       count = rte_ring_dequeue_burst(internals->rx_ring,
> > > +                                       (void **)bufs,
> > > +                                       PMD_BOND_RECV_PKTS_PER_SLAVE,
> > > +                                       NULL);
> > > +                       for  (j = 0; j < count; j++)
> > > +                               rte_pktmbuf_free(bufs[j]);
> > > +               } while (count > 0);
> > >         }
> > >         return 0;
> > >  }
> > > diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c
> > > b/drivers/net/bonding/rte_eth_bond_pmd.c
> > > index 58f7377c60..ae756c4e3a 100644
> > > --- a/drivers/net/bonding/rte_eth_bond_pmd.c
> > > +++ b/drivers/net/bonding/rte_eth_bond_pmd.c
> > > @@ -18,6 +18,8 @@
> > >  #include <rte_alarm.h>
> > >  #include <rte_cycles.h>
> > >  #include <rte_string_fns.h>
> > > +#include <rte_errno.h>
> > > +#include <rte_lcore.h>
> > >
> > >  #include "rte_eth_bond.h"
> > >  #include "rte_eth_bond_private.h"
> > > @@ -402,10 +404,15 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> > struct
> > > rte_mbuf **bufs,
> > >         struct bond_dev_private *internals = bd_rx_q->dev_private;
> > >         struct ether_addr bond_mac;
> > >
> > > +       unsigned int rx_ring_avail =
> > > rte_ring_free_count(internals->rx_ring);
> > > +       struct rte_mbuf
> > *mbuf_bounce[PMD_BOND_RECV_PKTS_PER_SLAVE];
> > > +
> > >         struct ether_hdr *hdr;
> > >
> > >         const uint16_t ether_type_slow_be =
> > > rte_be_to_cpu_16(ETHER_TYPE_SLOW);
> > >         uint16_t num_rx_total = 0;      /* Total number of received
> > > packets */
> > > +       uint16_t num_rx_slave;
> > > +       uint16_t num_enq_slave;
> > >         uint16_t slaves[RTE_MAX_ETHPORTS];
> > >         uint16_t slave_count, idx;
> > >
> > > @@ -414,6 +421,9 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> > struct
> > > rte_mbuf **bufs,
> > >         uint8_t i, j, k;
> > >         uint8_t subtype;
> > >
> > > +       if (rx_ring_avail < PMD_BOND_RECV_PKTS_PER_SLAVE)
> > > +               goto dequeue;
> > > +
> > >         rte_eth_macaddr_get(internals->port_id, &bond_mac);
> > >         /* Copy slave list to protect against slave up/down changes
> > > during tx
> > >          * bursting */
> > > @@ -426,62 +436,96 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> > struct
> > > rte_mbuf **bufs,
> > >                 internals->active_slave = 0;
> > >                 idx = 0;
> > >         }
> > > -       for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
> > > -               j = num_rx_total;
> > > +       for (i = 0; i < slave_count && num_rx_total < rx_ring_avail; i++) {
> > > +               j = 0;
> > >                 collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
> > >                                          COLLECTING);
> > >
> > >                 /* Read packets from this slave */
> > > -               num_rx_total += rte_eth_rx_burst(slaves[idx],
> > > bd_rx_q->queue_id,
> > > -                               &bufs[num_rx_total], nb_pkts -
> > > num_rx_total);
> > > +               if (unlikely(rx_ring_avail - num_rx_total <
> > > +                               PMD_BOND_RECV_PKTS_PER_SLAVE))
> > > +                       continue;
> > > +               num_rx_slave = rte_eth_rx_burst(slaves[idx],
> > > bd_rx_q->queue_id,
> > > +                               mbuf_bounce,
> > > + PMD_BOND_RECV_PKTS_PER_SLAVE);
> > >
> > > -               for (k = j; k < 2 && k < num_rx_total; k++)
> > > -                       rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));
> > > +               for (k = j; k < 2 && k < num_rx_slave; k++)
> > > +
> > > + rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[k],
> > > void *));
> > >
> > >                 /* Handle slow protocol packets. */
> > > -               while (j < num_rx_total) {
> > > +               while (j < num_rx_slave) {
> > >
> > >                         /* If packet is not pure L2 and is known,
> > > skip it */
> > > -                       if ((bufs[j]->packet_type & ~RTE_PTYPE_L2_ETHER)
> > > != 0) {
> > > +                       if ((mbuf_bounce[j]->packet_type &
> > > ~RTE_PTYPE_L2_ETHER) != 0) {
> > >                                 j++;
> > >                                 continue;
> > >                         }
> > >
> > > -                       if (j + 3 < num_rx_total)
> > > -                               rte_prefetch0(rte_pktmbuf_mtod(bufs[j +
> > > 3], void *));
> > > +                       if (j + 3 < num_rx_slave)
> > > +
> > >  rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[j + 3],
> > > +                                                              void
> > > + *));
> > >
> > > -                       hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr
> > > *);
> > > +                       hdr = rte_pktmbuf_mtod(mbuf_bounce[j],
> > > +                                              struct ether_hdr *);
> > >                         subtype = ((struct slow_protocol_frame
> > > *)hdr)->slow_protocol.subtype;
> > >
> > >                         /* Remove packet from array if it is slow
> > > packet or slave is not
> > >                          * in collecting state or bonding interface
> > > is not in promiscuous
> > >                          * mode and packet address does not match. */
> > > -                       if (unlikely(is_lacp_packets(hdr->ether_type,
> > > subtype, bufs[j]) ||
> > > +                       if (unlikely(is_lacp_packets(hdr->ether_type,
> > > +                                                    subtype,
> > > mbuf_bounce[j]) ||
> > >                                 !collecting || (!promisc &&
> > >
> > > !is_multicast_ether_addr(&hdr->d_addr) &&
> > >
> > > !is_same_ether_addr(&bond_mac,
> > > &hdr->d_addr)))) {
> > >
> > >                                 if (hdr->ether_type ==
> > > ether_type_slow_be) {
> > >                                         bond_mode_8023ad_handle_slow_pkt(
> > > -                                           internals, slaves[idx],
> > > bufs[j]);
> > > +                                           internals, slaves[idx],
> > > +                                           mbuf_bounce[j]);
> > >                                 } else
> > > -                                       rte_pktmbuf_free(bufs[j]);
> > > +
> > > + rte_pktmbuf_free(mbuf_bounce[j]);
> > >
> > >                                 /* Packet is managed by mode 4 or
> > > dropped, shift the array */
> > > -                               num_rx_total--;
> > > -                               if (j < num_rx_total) {
> > > -                                       memmove(&bufs[j], &bufs[j + 1],
> > > sizeof(bufs[0]) *
> > > -                                               (num_rx_total - j));
> > > +                               num_rx_slave--;
> > > +                               if (j < num_rx_slave) {
> > > +                                       memmove(&mbuf_bounce[j],
> > > +                                               &mbuf_bounce[j + 1],
> > > +                                               sizeof(mbuf_bounce[0]) *
> > > +                                               (num_rx_slave - j));
> > >                                 }
> > > -                       } else
> > > +                       } else {
> > >                                 j++;
> > > +                       }
> > >                 }
> > > +
> > > +               if (num_rx_slave > 0) {
> > > +                       if (mbuf_bounce[0] == NULL)
> > > +                               RTE_LOG(ERR, PMD, "%s: Enqueue a NULL??\n",
> > > +                                       __func__);
> > > +
> > > +                       num_enq_slave =
> > > rte_ring_enqueue_burst(internals->rx_ring,
> > > +                                                              (void
> > > **)mbuf_bounce,
> > > +
> > > num_rx_slave,
> > > +
> > > + NULL);
> > > +
> > > +                       if (num_enq_slave < num_rx_slave) {
> > > +                               RTE_LOG(ERR, PMD,
> > > +                                       "%s: failed to enqueue %u packets",
> > > +                                       __func__,
> > > +                                       (num_rx_slave - num_enq_slave));
> > > +                               for (j = num_enq_slave; j <
> > > + num_rx_slave;
> > > j++)
> > > +                                       rte_pktmbuf_free(mbuf_bounce[j]);
> > > +                       }
> > > +                       num_rx_total += num_enq_slave;
> > > +               }
> > > +
> > >                 if (unlikely(++idx == slave_count))
> > >                         idx = 0;
> > >         }
> > >
> > >         internals->active_slave = idx;
> > > -       return num_rx_total;
> > > +dequeue:
> > > +       return rte_ring_dequeue_burst(internals->rx_ring, (void **)bufs,
> > > +                                     nb_pkts, NULL);
> > >  }
> > >
> > >  #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) ||
> > > defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
> > > @@ -3065,6 +3109,7 @@ bond_alloc(struct rte_vdev_device *dev,
> > > uint8_t
> > mode)
> > >         struct bond_dev_private *internals = NULL;
> > >         struct rte_eth_dev *eth_dev = NULL;
> > >         uint32_t vlan_filter_bmp_size;
> > > +       char mem_name[RTE_ETH_NAME_MAX_LEN];
> > >
> > >         /* now do all data allocation - for eth_dev structure, dummy
> > > pci driver
> > >          * and internal (private) data @@ -3129,6 +3174,19 @@
> > > bond_alloc(struct rte_vdev_device *dev, uint8_t
> > > mode)
> > >         TAILQ_INIT(&internals->flow_list);
> > >         internals->flow_isolated_valid = 0;
> > >
> > > +       snprintf(mem_name, RTE_DIM(mem_name), "bond_%u_rx",
> > > internals->port_id);
> > > +       internals->rx_ring = rte_ring_lookup(mem_name);
> > > +       if (internals->rx_ring == NULL) {
> > > +               internals->rx_ring = rte_ring_create(mem_name,
> > > +
> > > rte_align32pow2(PMD_BOND_RECV_RING_PKTS *
> > > +                                                    rte_lcore_count()),
> > > +                                    socket_id, 0);
> > > +               if (internals->rx_ring == NULL)
> > > +                       rte_panic("%s: Failed to create rx ring '%s':
> > > %s\n", name,
> > > +                                 mem_name, rte_strerror(rte_errno));
> > > +       }
> > > +
> > > +
> > >         /* Set mode 4 default configuration */
> > >         bond_mode_8023ad_setup(eth_dev, NULL);
> > >         if (bond_ethdev_mode_set(eth_dev, mode)) { diff --git
> > > a/drivers/net/bonding/rte_eth_bond_private.h
> > > b/drivers/net/bonding/rte_eth_bond_private.h
> > > index 43e0e448df..80261c6b14 100644
> > > --- a/drivers/net/bonding/rte_eth_bond_private.h
> > > +++ b/drivers/net/bonding/rte_eth_bond_private.h
> > > @@ -26,6 +26,8 @@
> > >  #define PMD_BOND_LSC_POLL_PERIOD_KVARG
> > ("lsc_poll_period_ms")
> > >  #define PMD_BOND_LINK_UP_PROP_DELAY_KVARG      ("up_delay")
> > >  #define PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG
> > ("down_delay")
> > > +#define PMD_BOND_RECV_RING_PKTS                512
> > > +#define PMD_BOND_RECV_PKTS_PER_SLAVE           32
> > >
> > >  #define PMD_BOND_XMIT_POLICY_LAYER2_KVARG      ("l2")
> > >  #define PMD_BOND_XMIT_POLICY_LAYER23_KVARG     ("l23")
> > > @@ -175,6 +177,8 @@ struct bond_dev_private {
> > >
> > >         void *vlan_filter_bmpmem;               /* enabled vlan filter
> > > bitmap */
> > >         struct rte_bitmap *vlan_filter_bmp;
> > > +
> > > +       struct rte_ring *rx_ring;
> > >  };
> > >
> > >  extern const struct eth_dev_ops default_dev_ops;
> > > --
> > > 2.18.0
> > >
> > >
  
Chas Williams Aug. 21, 2018, 6:19 p.m. UTC | #6
On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad <matan@mellanox.com> wrote:

> Hi Chas
>
> From: Chas Williams
> > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad <matan@mellanox.com>
> > wrote:
> > Hi
> >
> > From: Chas Williams
> > > This will need to be implemented for some of the other RX burst
> > > methods at some point for other modes to see this performance
> > > improvement (with the exception of active-backup).
> >
> > Yes, I think it should be done at least to
> > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for now.
> >
> > There is some duplicated code between the various RX paths.
> > I would like to eliminate that as much as possible, so I was going to
> give that
> > some thought first.
>
> There is no reason to stay this function as is while its twin is changed.
>

Unfortunately, this is all the patch I have at this time.


>
> >
> >
> > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <bluca@debian.org>
> wrote:
> > >
> > > > During bond 802.3ad receive, a burst of packets is fetched from each
> > > > slave into a local array and appended to per-slave ring buffer.
> > > > Packets are taken from the head of the ring buffer and returned to
> > > > the caller.  The number of mbufs provided to each slave is
> > > > sufficient to meet the requirements of the ixgbe vector receive.
> >
> > Luca,
> >
> > Can you explain these requirements of ixgbe?
> >
> > The ixgbe (and some other Intel PMDs) have vectorized RX routines that
> are
> > more efficient (if not faster) taking advantage of some advanced CPU
> > instructions.  I think you need to be receiving at least 32 packets or
> more.
>
> So, why to do it in bond which is a generic driver for all the vendors
> PMDs,
> If for ixgbe and other Intel nics it is better you can force those PMDs to
> receive always 32 packets
> and to manage a ring by themselves.
>

The drawback of the ring is some additional latency on the receive path.
In testing, the additional latency hasn't been an issue for bonding. The
bonding PMD has a fair bit of overhead associated with the RX and TX path
calculations.  Most applications can just arrange to call the RX path
with a sufficiently large receive.  Bonding can't do this.


>
> > Did you check for other vendor PMDs? It may hurt performance there..
> >
> > I don't know, but I suspect probably not.  For the most part you are
> typically
> > reading almost up to the vector requirement.  But if one slave has just a
> > single packet, then you can't vectorize on the next slave.
> >
>
> I don't think that the ring overhead is better for PMDs which are not
> using the vectorized instructions.
>

The non-vectorized PMDs are usually quite slow.  The additional
overhead doesn't make a difference in their performance.



> > Empirically, this patch has been around of years in one form or
> another.  No
> > complaints other than "bonding is slow" which is why it was written.
> >
> >
> > > >
> > > > This small change improves performances of the bonding driver
> > > > considerably. Vyatta has been using it for years in production.
> > > >
> > > > Cc: stable@dpdk.org
> > > >
> > > > Signed-off-by: Eric Kinzie <ekinzie@brocade.com>
> > > > Signed-off-by: Luca Boccassi <bluca@debian.org>
> > > >
> > >
> > > Acked-by: Chas Williams <chas3@att.com>
> > >
> > >
> > >
> > > > ---
> > > > v2 and v3: fix checkpatch warnings
> > > > v4: add Eric's original signed-off-by from the Vyatta internal repo
> > > >
> > > >  drivers/net/bonding/rte_eth_bond_api.c     | 13 +++
> > > >  drivers/net/bonding/rte_eth_bond_pmd.c     | 98
> > +++++++++++++++++--
> > > ---
> > > >  drivers/net/bonding/rte_eth_bond_private.h |  4 +
> > > >  3 files changed, 95 insertions(+), 20 deletions(-)
> > > >
> > > > diff --git a/drivers/net/bonding/rte_eth_bond_api.c
> > > > b/drivers/net/bonding/rte_eth_bond_api.c
> > > > index 8bc04cfd11..3d22203e91 100644
> > > > --- a/drivers/net/bonding/rte_eth_bond_api.c
> > > > +++ b/drivers/net/bonding/rte_eth_bond_api.c
> > > > @@ -524,6 +524,10 @@ __eth_bond_slave_remove_lock_free(uint16_t
> > > > bonded_port_id,
> > > >
> > > > sizeof(*(rte_eth_devices[bonded_port_id].data->mac_addrs)));
> > > >         }
> > > >         if (internals->slave_count == 0) {
> > > > +               /* Remove any remaining packets in the receive ring
> */
> > > > +               struct rte_mbuf *bufs[PMD_BOND_RECV_PKTS_PER_SLAVE];
> > > > +               unsigned int j, count;
> > > > +
> > > >                 internals->rx_offload_capa = 0;
> > > >                 internals->tx_offload_capa = 0;
> > > >                 internals->rx_queue_offload_capa = 0; @@ -532,6
> > > > +536,15 @@ __eth_bond_slave_remove_lock_free(uint16_t
> > > > bonded_port_id,
> > > >                 internals->reta_size = 0;
> > > >                 internals->candidate_max_rx_pktlen = 0;
> > > >                 internals->max_rx_pktlen = 0;
> > > > +
> > > > +               do {
> > > > +                       count =
> rte_ring_dequeue_burst(internals->rx_ring,
> > > > +                                       (void **)bufs,
> > > > +                                       PMD_BOND_RECV_PKTS_PER_SLAVE,
> > > > +                                       NULL);
> > > > +                       for  (j = 0; j < count; j++)
> > > > +                               rte_pktmbuf_free(bufs[j]);
> > > > +               } while (count > 0);
> > > >         }
> > > >         return 0;
> > > >  }
> > > > diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c
> > > > b/drivers/net/bonding/rte_eth_bond_pmd.c
> > > > index 58f7377c60..ae756c4e3a 100644
> > > > --- a/drivers/net/bonding/rte_eth_bond_pmd.c
> > > > +++ b/drivers/net/bonding/rte_eth_bond_pmd.c
> > > > @@ -18,6 +18,8 @@
> > > >  #include <rte_alarm.h>
> > > >  #include <rte_cycles.h>
> > > >  #include <rte_string_fns.h>
> > > > +#include <rte_errno.h>
> > > > +#include <rte_lcore.h>
> > > >
> > > >  #include "rte_eth_bond.h"
> > > >  #include "rte_eth_bond_private.h"
> > > > @@ -402,10 +404,15 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> > > struct
> > > > rte_mbuf **bufs,
> > > >         struct bond_dev_private *internals = bd_rx_q->dev_private;
> > > >         struct ether_addr bond_mac;
> > > >
> > > > +       unsigned int rx_ring_avail =
> > > > rte_ring_free_count(internals->rx_ring);
> > > > +       struct rte_mbuf
> > > *mbuf_bounce[PMD_BOND_RECV_PKTS_PER_SLAVE];
> > > > +
> > > >         struct ether_hdr *hdr;
> > > >
> > > >         const uint16_t ether_type_slow_be =
> > > > rte_be_to_cpu_16(ETHER_TYPE_SLOW);
> > > >         uint16_t num_rx_total = 0;      /* Total number of received
> > > > packets */
> > > > +       uint16_t num_rx_slave;
> > > > +       uint16_t num_enq_slave;
> > > >         uint16_t slaves[RTE_MAX_ETHPORTS];
> > > >         uint16_t slave_count, idx;
> > > >
> > > > @@ -414,6 +421,9 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> > > struct
> > > > rte_mbuf **bufs,
> > > >         uint8_t i, j, k;
> > > >         uint8_t subtype;
> > > >
> > > > +       if (rx_ring_avail < PMD_BOND_RECV_PKTS_PER_SLAVE)
> > > > +               goto dequeue;
> > > > +
> > > >         rte_eth_macaddr_get(internals->port_id, &bond_mac);
> > > >         /* Copy slave list to protect against slave up/down changes
> > > > during tx
> > > >          * bursting */
> > > > @@ -426,62 +436,96 @@ bond_ethdev_rx_burst_8023ad(void *queue,
> > > struct
> > > > rte_mbuf **bufs,
> > > >                 internals->active_slave = 0;
> > > >                 idx = 0;
> > > >         }
> > > > -       for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
> > > > -               j = num_rx_total;
> > > > +       for (i = 0; i < slave_count && num_rx_total < rx_ring_avail;
> i++) {
> > > > +               j = 0;
> > > >                 collecting =
> ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
> > > >                                          COLLECTING);
> > > >
> > > >                 /* Read packets from this slave */
> > > > -               num_rx_total += rte_eth_rx_burst(slaves[idx],
> > > > bd_rx_q->queue_id,
> > > > -                               &bufs[num_rx_total], nb_pkts -
> > > > num_rx_total);
> > > > +               if (unlikely(rx_ring_avail - num_rx_total <
> > > > +                               PMD_BOND_RECV_PKTS_PER_SLAVE))
> > > > +                       continue;
> > > > +               num_rx_slave = rte_eth_rx_burst(slaves[idx],
> > > > bd_rx_q->queue_id,
> > > > +                               mbuf_bounce,
> > > > + PMD_BOND_RECV_PKTS_PER_SLAVE);
> > > >
> > > > -               for (k = j; k < 2 && k < num_rx_total; k++)
> > > > -                       rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void
> *));
> > > > +               for (k = j; k < 2 && k < num_rx_slave; k++)
> > > > +
> > > > + rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[k],
> > > > void *));
> > > >
> > > >                 /* Handle slow protocol packets. */
> > > > -               while (j < num_rx_total) {
> > > > +               while (j < num_rx_slave) {
> > > >
> > > >                         /* If packet is not pure L2 and is known,
> > > > skip it */
> > > > -                       if ((bufs[j]->packet_type &
> ~RTE_PTYPE_L2_ETHER)
> > > > != 0) {
> > > > +                       if ((mbuf_bounce[j]->packet_type &
> > > > ~RTE_PTYPE_L2_ETHER) != 0) {
> > > >                                 j++;
> > > >                                 continue;
> > > >                         }
> > > >
> > > > -                       if (j + 3 < num_rx_total)
> > > > -
>  rte_prefetch0(rte_pktmbuf_mtod(bufs[j +
> > > > 3], void *));
> > > > +                       if (j + 3 < num_rx_slave)
> > > > +
> > > >  rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[j + 3],
> > > > +                                                              void
> > > > + *));
> > > >
> > > > -                       hdr = rte_pktmbuf_mtod(bufs[j], struct
> ether_hdr
> > > > *);
> > > > +                       hdr = rte_pktmbuf_mtod(mbuf_bounce[j],
> > > > +                                              struct ether_hdr *);
> > > >                         subtype = ((struct slow_protocol_frame
> > > > *)hdr)->slow_protocol.subtype;
> > > >
> > > >                         /* Remove packet from array if it is slow
> > > > packet or slave is not
> > > >                          * in collecting state or bonding interface
> > > > is not in promiscuous
> > > >                          * mode and packet address does not match. */
> > > > -                       if (unlikely(is_lacp_packets(hdr->ether_type,
> > > > subtype, bufs[j]) ||
> > > > +                       if (unlikely(is_lacp_packets(hdr->ether_type,
> > > > +                                                    subtype,
> > > > mbuf_bounce[j]) ||
> > > >                                 !collecting || (!promisc &&
> > > >
> > > > !is_multicast_ether_addr(&hdr->d_addr) &&
> > > >
> > > > !is_same_ether_addr(&bond_mac,
> > > > &hdr->d_addr)))) {
> > > >
> > > >                                 if (hdr->ether_type ==
> > > > ether_type_slow_be) {
> > > >
>  bond_mode_8023ad_handle_slow_pkt(
> > > > -                                           internals, slaves[idx],
> > > > bufs[j]);
> > > > +                                           internals, slaves[idx],
> > > > +                                           mbuf_bounce[j]);
> > > >                                 } else
> > > > -                                       rte_pktmbuf_free(bufs[j]);
> > > > +
> > > > + rte_pktmbuf_free(mbuf_bounce[j]);
> > > >
> > > >                                 /* Packet is managed by mode 4 or
> > > > dropped, shift the array */
> > > > -                               num_rx_total--;
> > > > -                               if (j < num_rx_total) {
> > > > -                                       memmove(&bufs[j], &bufs[j +
> 1],
> > > > sizeof(bufs[0]) *
> > > > -                                               (num_rx_total - j));
> > > > +                               num_rx_slave--;
> > > > +                               if (j < num_rx_slave) {
> > > > +                                       memmove(&mbuf_bounce[j],
> > > > +                                               &mbuf_bounce[j + 1],
> > > > +
>  sizeof(mbuf_bounce[0]) *
> > > > +                                               (num_rx_slave - j));
> > > >                                 }
> > > > -                       } else
> > > > +                       } else {
> > > >                                 j++;
> > > > +                       }
> > > >                 }
> > > > +
> > > > +               if (num_rx_slave > 0) {
> > > > +                       if (mbuf_bounce[0] == NULL)
> > > > +                               RTE_LOG(ERR, PMD, "%s: Enqueue a
> NULL??\n",
> > > > +                                       __func__);
> > > > +
> > > > +                       num_enq_slave =
> > > > rte_ring_enqueue_burst(internals->rx_ring,
> > > > +                                                              (void
> > > > **)mbuf_bounce,
> > > > +
> > > > num_rx_slave,
> > > > +
> > > > + NULL);
> > > > +
> > > > +                       if (num_enq_slave < num_rx_slave) {
> > > > +                               RTE_LOG(ERR, PMD,
> > > > +                                       "%s: failed to enqueue %u
> packets",
> > > > +                                       __func__,
> > > > +                                       (num_rx_slave -
> num_enq_slave));
> > > > +                               for (j = num_enq_slave; j <
> > > > + num_rx_slave;
> > > > j++)
> > > > +
>  rte_pktmbuf_free(mbuf_bounce[j]);
> > > > +                       }
> > > > +                       num_rx_total += num_enq_slave;
> > > > +               }
> > > > +
> > > >                 if (unlikely(++idx == slave_count))
> > > >                         idx = 0;
> > > >         }
> > > >
> > > >         internals->active_slave = idx;
> > > > -       return num_rx_total;
> > > > +dequeue:
> > > > +       return rte_ring_dequeue_burst(internals->rx_ring, (void
> **)bufs,
> > > > +                                     nb_pkts, NULL);
> > > >  }
> > > >
> > > >  #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) ||
> > > > defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
> > > > @@ -3065,6 +3109,7 @@ bond_alloc(struct rte_vdev_device *dev,
> > > > uint8_t
> > > mode)
> > > >         struct bond_dev_private *internals = NULL;
> > > >         struct rte_eth_dev *eth_dev = NULL;
> > > >         uint32_t vlan_filter_bmp_size;
> > > > +       char mem_name[RTE_ETH_NAME_MAX_LEN];
> > > >
> > > >         /* now do all data allocation - for eth_dev structure, dummy
> > > > pci driver
> > > >          * and internal (private) data @@ -3129,6 +3174,19 @@
> > > > bond_alloc(struct rte_vdev_device *dev, uint8_t
> > > > mode)
> > > >         TAILQ_INIT(&internals->flow_list);
> > > >         internals->flow_isolated_valid = 0;
> > > >
> > > > +       snprintf(mem_name, RTE_DIM(mem_name), "bond_%u_rx",
> > > > internals->port_id);
> > > > +       internals->rx_ring = rte_ring_lookup(mem_name);
> > > > +       if (internals->rx_ring == NULL) {
> > > > +               internals->rx_ring = rte_ring_create(mem_name,
> > > > +
> > > > rte_align32pow2(PMD_BOND_RECV_RING_PKTS *
> > > > +
> rte_lcore_count()),
> > > > +                                    socket_id, 0);
> > > > +               if (internals->rx_ring == NULL)
> > > > +                       rte_panic("%s: Failed to create rx ring '%s':
> > > > %s\n", name,
> > > > +                                 mem_name, rte_strerror(rte_errno));
> > > > +       }
> > > > +
> > > > +
> > > >         /* Set mode 4 default configuration */
> > > >         bond_mode_8023ad_setup(eth_dev, NULL);
> > > >         if (bond_ethdev_mode_set(eth_dev, mode)) { diff --git
> > > > a/drivers/net/bonding/rte_eth_bond_private.h
> > > > b/drivers/net/bonding/rte_eth_bond_private.h
> > > > index 43e0e448df..80261c6b14 100644
> > > > --- a/drivers/net/bonding/rte_eth_bond_private.h
> > > > +++ b/drivers/net/bonding/rte_eth_bond_private.h
> > > > @@ -26,6 +26,8 @@
> > > >  #define PMD_BOND_LSC_POLL_PERIOD_KVARG
> > > ("lsc_poll_period_ms")
> > > >  #define PMD_BOND_LINK_UP_PROP_DELAY_KVARG      ("up_delay")
> > > >  #define PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG
> > > ("down_delay")
> > > > +#define PMD_BOND_RECV_RING_PKTS                512
> > > > +#define PMD_BOND_RECV_PKTS_PER_SLAVE           32
> > > >
> > > >  #define PMD_BOND_XMIT_POLICY_LAYER2_KVARG      ("l2")
> > > >  #define PMD_BOND_XMIT_POLICY_LAYER23_KVARG     ("l23")
> > > > @@ -175,6 +177,8 @@ struct bond_dev_private {
> > > >
> > > >         void *vlan_filter_bmpmem;               /* enabled vlan
> filter
> > > > bitmap */
> > > >         struct rte_bitmap *vlan_filter_bmp;
> > > > +
> > > > +       struct rte_ring *rx_ring;
> > > >  };
> > > >
> > > >  extern const struct eth_dev_ops default_dev_ops;
> > > > --
> > > > 2.18.0
> > > >
> > > >
>
  
Matan Azrad Aug. 22, 2018, 7:09 a.m. UTC | #7
Hi Chas

From: Chas Williams
>On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad <mailto:matan@mellanox.com> wrote:
>Hi Chas
>
>From: Chas Williams
>> On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad <mailto:matan@mellanox.com>
>> wrote:
>> Hi
>> 
>> From: Chas Williams
>> > This will need to be implemented for some of the other RX burst
>> > methods at some point for other modes to see this performance
>> > improvement (with the exception of active-backup).
>> 
>> Yes, I think it should be done at least to
>> bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for now.
>> 
>> There is some duplicated code between the various RX paths.
>> I would like to eliminate that as much as possible, so I was going to give that
>> some thought first.
>
>There is no reason to stay this function as is while its twin is changed.
>
>Unfortunately, this is all the patch I have at this time.

>
>> 
>> 
>> > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <mailto:bluca@debian.org> wrote:
>> >
>> > > During bond 802.3ad receive, a burst of packets is fetched from each
>> > > slave into a local array and appended to per-slave ring buffer.
>> > > Packets are taken from the head of the ring buffer and returned to
>> > > the caller.  The number of mbufs provided to each slave is
>> > > sufficient to meet the requirements of the ixgbe vector receive.
>> 
>> Luca,
>> 
>> Can you explain these requirements of ixgbe?
>> 
>> The ixgbe (and some other Intel PMDs) have vectorized RX routines that are
>> more efficient (if not faster) taking advantage of some advanced CPU
>> instructions.  I think you need to be receiving at least 32 packets or more.
>
>So, why to do it in bond which is a generic driver for all the vendors PMDs,
>If for ixgbe and other Intel nics it is better you can force those PMDs to receive always 32 packets
>and to manage a ring by themselves.
>
>The drawback of the ring is some additional latency on the receive path.
>In testing, the additional latency hasn't been an issue for bonding.

When bonding does processing slower it may be a bottleneck for the packet processing for some application. 

> The bonding PMD has a fair bit of overhead associated with the RX and TX path
>calculations.  Most applications can just arrange to call the RX path
>with a sufficiently large receive.  Bonding can't do this.

I didn't talk on application I talked on the slave PMDs,
The slave PMD can manage a ring by itself if it helps for its own performance.
The bonding should not be oriented to specific PMDs.


>> Did you check for other vendor PMDs? It may hurt performance there..
>> 
>> I don't know, but I suspect probably not.  For the most part you are typically
>> reading almost up to the vector requirement.  But if one slave has just a
>> single packet, then you can't vectorize on the next slave.
>> 
>
>I don't think that the ring overhead is better for PMDs which are not using the vectorized instructions.
>
>The non-vectorized PMDs are usually quite slow.  The additional
>overhead doesn't make a difference in their performance.

We should not do things worse than they are.
  
Luca Boccassi Aug. 22, 2018, 10:19 a.m. UTC | #8
On Wed, 2018-08-22 at 07:09 +0000, Matan Azrad wrote:
> Hi Chas
> 
> From: Chas Williams
> > On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad <mailto:matan@mellanox
> > .com> wrote:
> > Hi Chas
> > 
> > From: Chas Williams
> > > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad <mailto:matan@mellano
> > > x.com>
> > > wrote:
> > > Hi
> > > 
> > > From: Chas Williams
> > > > This will need to be implemented for some of the other RX burst
> > > > methods at some point for other modes to see this performance
> > > > improvement (with the exception of active-backup).
> > > 
> > > Yes, I think it should be done at least to
> > > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for now.
> > > 
> > > There is some duplicated code between the various RX paths.
> > > I would like to eliminate that as much as possible, so I was
> > > going to give that
> > > some thought first.
> > 
> > There is no reason to stay this function as is while its twin is
> > changed.
> > 
> > Unfortunately, this is all the patch I have at this time.
> >  
> > 
> > > 
> > > 
> > > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <mailto:bluca@deb
> > > > ian.org> wrote:
> > > > 
> > > > > During bond 802.3ad receive, a burst of packets is fetched
> > > > > from each
> > > > > slave into a local array and appended to per-slave ring
> > > > > buffer.
> > > > > Packets are taken from the head of the ring buffer and
> > > > > returned to
> > > > > the caller.  The number of mbufs provided to each slave is
> > > > > sufficient to meet the requirements of the ixgbe vector
> > > > > receive.
> > > 
> > > Luca,
> > > 
> > > Can you explain these requirements of ixgbe?
> > > 
> > > The ixgbe (and some other Intel PMDs) have vectorized RX routines
> > > that are
> > > more efficient (if not faster) taking advantage of some advanced
> > > CPU
> > > instructions.  I think you need to be receiving at least 32
> > > packets or more.
> > 
> > So, why to do it in bond which is a generic driver for all the
> > vendors PMDs,
> > If for ixgbe and other Intel nics it is better you can force those
> > PMDs to receive always 32 packets
> > and to manage a ring by themselves.
> > 
> > The drawback of the ring is some additional latency on the receive
> > path.
> > In testing, the additional latency hasn't been an issue for
> > bonding.
> 
> When bonding does processing slower it may be a bottleneck for the
> packet processing for some application. 
> 
> > The bonding PMD has a fair bit of overhead associated with the RX
> > and TX path
> > calculations.  Most applications can just arrange to call the RX
> > path
> > with a sufficiently large receive.  Bonding can't do this.
> 
> I didn't talk on application I talked on the slave PMDs,
> The slave PMD can manage a ring by itself if it helps for its own
> performance.
> The bonding should not be oriented to specific PMDs.

The issue though is that the performance problem is not with the
individual PMDs - it's with bonding. There were no reports regarding
the individual PMDs.
This comes from reports from customers from real world production
deployments - the issue of bonding being too slow was raised multiple
times. This patch addresses those issues, again in production
deployments, where it's been used for years, to users and customers
satisfaction.

So I'd like to share this improvement rather than keeping it private -
because I'm nice that way :-P

> > > Did you check for other vendor PMDs? It may hurt performance
> > > there..
> > > 
> > > I don't know, but I suspect probably not.  For the most part you
> > > are typically
> > > reading almost up to the vector requirement.  But if one slave
> > > has just a
> > > single packet, then you can't vectorize on the next slave.
> > > 
> > 
> > I don't think that the ring overhead is better for PMDs which are
> > not using the vectorized instructions.
> > 
> > The non-vectorized PMDs are usually quite slow.  The additional
> > overhead doesn't make a difference in their performance.
> 
> We should not do things worse than they are.

There were no reports that this made things worse. The feedback from
production was that it made things better.
  
Matan Azrad Aug. 22, 2018, 11:42 a.m. UTC | #9
Hi Luca

From: Luca Boccassi
> On Wed, 2018-08-22 at 07:09 +0000, Matan Azrad wrote:
> > Hi Chas
> >
> > From: Chas Williams
> > > On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad <mailto:matan@mellanox
> > > .com> wrote:
> > > Hi Chas
> > >
> > > From: Chas Williams
> > > > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad <mailto:matan@mellano
> > > > x.com>
> > > > wrote:
> > > > Hi
> > > >
> > > > From: Chas Williams
> > > > > This will need to be implemented for some of the other RX burst
> > > > > methods at some point for other modes to see this performance
> > > > > improvement (with the exception of active-backup).
> > > >
> > > > Yes, I think it should be done at least to
> > > > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for now.
> > > >
> > > > There is some duplicated code between the various RX paths.
> > > > I would like to eliminate that as much as possible, so I was going
> > > > to give that some thought first.
> > >
> > > There is no reason to stay this function as is while its twin is
> > > changed.
> > >
> > > Unfortunately, this is all the patch I have at this time.
> > >
> > >
> > > >
> > > >
> > > > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <mailto:bluca@deb
> > > > > ian.org> wrote:
> > > > >
> > > > > > During bond 802.3ad receive, a burst of packets is fetched
> > > > > > from each slave into a local array and appended to per-slave
> > > > > > ring buffer.
> > > > > > Packets are taken from the head of the ring buffer and
> > > > > > returned to the caller.  The number of mbufs provided to each
> > > > > > slave is sufficient to meet the requirements of the ixgbe
> > > > > > vector receive.
> > > >
> > > > Luca,
> > > >
> > > > Can you explain these requirements of ixgbe?
> > > >
> > > > The ixgbe (and some other Intel PMDs) have vectorized RX routines
> > > > that are more efficient (if not faster) taking advantage of some
> > > > advanced CPU instructions.  I think you need to be receiving at
> > > > least 32 packets or more.
> > >
> > > So, why to do it in bond which is a generic driver for all the
> > > vendors PMDs, If for ixgbe and other Intel nics it is better you can
> > > force those PMDs to receive always 32 packets and to manage a ring
> > > by themselves.
> > >
> > > The drawback of the ring is some additional latency on the receive
> > > path.
> > > In testing, the additional latency hasn't been an issue for bonding.
> >
> > When bonding does processing slower it may be a bottleneck for the
> > packet processing for some application.
> >
> > > The bonding PMD has a fair bit of overhead associated with the RX
> > > and TX path calculations.  Most applications can just arrange to
> > > call the RX path with a sufficiently large receive.  Bonding can't
> > > do this.
> >
> > I didn't talk on application I talked on the slave PMDs, The slave PMD
> > can manage a ring by itself if it helps for its own performance.
> > The bonding should not be oriented to specific PMDs.
> 
> The issue though is that the performance problem is not with the individual
> PMDs - it's with bonding. There were no reports regarding the individual
> PMDs.
> This comes from reports from customers from real world production
> deployments - the issue of bonding being too slow was raised multiple times.
> This patch addresses those issues, again in production deployments, where
> it's been used for years, to users and customers satisfaction.

From Chas I understood that using burst of 32 helps for some slave PMDs performance which makes sense.
I can't understand how the extra copy phases improves the bonding itself performance:

You added 2 copy phases in the bonding RX function:
1. Get packets from the slave to a local array.
2. Copy packet pointers from a local array to the ring array.
3. Copy packet pointers from the ring array to the application array.

Each packet arriving to the application must pass the above 3 phases(in a specific call or in previous calls).

Without this patch we have only -
Get packets from the slave to the application array.

Can you explain how the extra copies improves the bonding performance?

Looks like it improves the slaves PMDs and because of that the bonding PMD performance becomes better.

> So I'd like to share this improvement rather than keeping it private - because
> I'm nice that way :-P
> 
> > > > Did you check for other vendor PMDs? It may hurt performance
> > > > there..
> > > >
> > > > I don't know, but I suspect probably not.  For the most part you
> > > > are typically reading almost up to the vector requirement.  But if
> > > > one slave has just a single packet, then you can't vectorize on
> > > > the next slave.
> > > >
> > >
> > > I don't think that the ring overhead is better for PMDs which are
> > > not using the vectorized instructions.
> > >
> > > The non-vectorized PMDs are usually quite slow.  The additional
> > > overhead doesn't make a difference in their performance.
> >
> > We should not do things worse than they are.
> 
> There were no reports that this made things worse. The feedback from
> production was that it made things better.

Yes, It may be good for specific slaves drivers but hurt another slaves drivers,
So maybe it should stay private to specific costumers using specific nics.

Again, I can understand how this patch improves performance of some PMDs
therefore I think the bonding is not the place to add it but maybe some PMDs.
  
Eric Kinzie Aug. 22, 2018, 5:43 p.m. UTC | #10
On Wed Aug 22 11:42:37 +0000 2018, Matan Azrad wrote:
> Hi Luca
> 
> From: Luca Boccassi
> > On Wed, 2018-08-22 at 07:09 +0000, Matan Azrad wrote:
> > > Hi Chas
> > >
> > > From: Chas Williams
> > > > On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad <mailto:matan@mellanox
> > > > .com> wrote:
> > > > Hi Chas
> > > >
> > > > From: Chas Williams
> > > > > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad <mailto:matan@mellano
> > > > > x.com>
> > > > > wrote:
> > > > > Hi
> > > > >
> > > > > From: Chas Williams
> > > > > > This will need to be implemented for some of the other RX burst
> > > > > > methods at some point for other modes to see this performance
> > > > > > improvement (with the exception of active-backup).
> > > > >
> > > > > Yes, I think it should be done at least to
> > > > > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for now.
> > > > >
> > > > > There is some duplicated code between the various RX paths.
> > > > > I would like to eliminate that as much as possible, so I was going
> > > > > to give that some thought first.
> > > >
> > > > There is no reason to stay this function as is while its twin is
> > > > changed.
> > > >
> > > > Unfortunately, this is all the patch I have at this time.
> > > >
> > > >
> > > > >
> > > > >
> > > > > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi <mailto:bluca@deb
> > > > > > ian.org> wrote:
> > > > > >
> > > > > > > During bond 802.3ad receive, a burst of packets is fetched
> > > > > > > from each slave into a local array and appended to per-slave
> > > > > > > ring buffer.
> > > > > > > Packets are taken from the head of the ring buffer and
> > > > > > > returned to the caller.  The number of mbufs provided to each
> > > > > > > slave is sufficient to meet the requirements of the ixgbe
> > > > > > > vector receive.
> > > > >
> > > > > Luca,
> > > > >
> > > > > Can you explain these requirements of ixgbe?
> > > > >
> > > > > The ixgbe (and some other Intel PMDs) have vectorized RX routines
> > > > > that are more efficient (if not faster) taking advantage of some
> > > > > advanced CPU instructions.  I think you need to be receiving at
> > > > > least 32 packets or more.
> > > >
> > > > So, why to do it in bond which is a generic driver for all the
> > > > vendors PMDs, If for ixgbe and other Intel nics it is better you can
> > > > force those PMDs to receive always 32 packets and to manage a ring
> > > > by themselves.
> > > >
> > > > The drawback of the ring is some additional latency on the receive
> > > > path.
> > > > In testing, the additional latency hasn't been an issue for bonding.
> > >
> > > When bonding does processing slower it may be a bottleneck for the
> > > packet processing for some application.
> > >
> > > > The bonding PMD has a fair bit of overhead associated with the RX
> > > > and TX path calculations.  Most applications can just arrange to
> > > > call the RX path with a sufficiently large receive.  Bonding can't
> > > > do this.
> > >
> > > I didn't talk on application I talked on the slave PMDs, The slave PMD
> > > can manage a ring by itself if it helps for its own performance.
> > > The bonding should not be oriented to specific PMDs.
> > 
> > The issue though is that the performance problem is not with the individual
> > PMDs - it's with bonding. There were no reports regarding the individual
> > PMDs.
> > This comes from reports from customers from real world production
> > deployments - the issue of bonding being too slow was raised multiple times.
> > This patch addresses those issues, again in production deployments, where
> > it's been used for years, to users and customers satisfaction.
> 
> From Chas I understood that using burst of 32 helps for some slave PMDs performance which makes sense.
> I can't understand how the extra copy phases improves the bonding itself performance:
> 
> You added 2 copy phases in the bonding RX function:
> 1. Get packets from the slave to a local array.
> 2. Copy packet pointers from a local array to the ring array.
> 3. Copy packet pointers from the ring array to the application array.
> 
> Each packet arriving to the application must pass the above 3 phases(in a specific call or in previous calls).
> 
> Without this patch we have only -
> Get packets from the slave to the application array.
> 
> Can you explain how the extra copies improves the bonding performance?
> 
> Looks like it improves the slaves PMDs and because of that the bonding PMD performance becomes better.

I'm not sure that adding more buffer management to the vector PMDs will
improve the drivers' performance; it's just that calling the rx function
in such a way that it returns no data wastes time.  The bonding driver
is already an exercise in buffer management so adding this layer of
indirection here makes sense in my opinion, as does hiding the details
of the consituent interfaces where possible.


> > So I'd like to share this improvement rather than keeping it private - because
> > I'm nice that way :-P
> > 
> > > > > Did you check for other vendor PMDs? It may hurt performance
> > > > > there..
> > > > >
> > > > > I don't know, but I suspect probably not.  For the most part you
> > > > > are typically reading almost up to the vector requirement.  But if
> > > > > one slave has just a single packet, then you can't vectorize on
> > > > > the next slave.
> > > > >
> > > >
> > > > I don't think that the ring overhead is better for PMDs which are
> > > > not using the vectorized instructions.
> > > >
> > > > The non-vectorized PMDs are usually quite slow.  The additional
> > > > overhead doesn't make a difference in their performance.
> > >
> > > We should not do things worse than they are.
> > 
> > There were no reports that this made things worse. The feedback from
> > production was that it made things better.
> 
> Yes, It may be good for specific slaves drivers but hurt another slaves drivers,
> So maybe it should stay private to specific costumers using specific nics.
> 
> Again, I can understand how this patch improves performance of some PMDs
> therefore I think the bonding is not the place to add it but maybe some PMDs.
  
Matan Azrad Aug. 23, 2018, 7:28 a.m. UTC | #11
Hi

From: Eric Kinzie
> On Wed Aug 22 11:42:37 +0000 2018, Matan Azrad wrote:
> > Hi Luca
> >
> > From: Luca Boccassi
> > > On Wed, 2018-08-22 at 07:09 +0000, Matan Azrad wrote:
> > > > Hi Chas
> > > >
> > > > From: Chas Williams
> > > > > On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad
> > > > > <mailto:matan@mellanox .com> wrote:
> > > > > Hi Chas
> > > > >
> > > > > From: Chas Williams
> > > > > > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad
> > > > > > <mailto:matan@mellano x.com>
> > > > > > wrote:
> > > > > > Hi
> > > > > >
> > > > > > From: Chas Williams
> > > > > > > This will need to be implemented for some of the other RX
> > > > > > > burst methods at some point for other modes to see this
> > > > > > > performance improvement (with the exception of active-backup).
> > > > > >
> > > > > > Yes, I think it should be done at least to
> > > > > > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for
> now.
> > > > > >
> > > > > > There is some duplicated code between the various RX paths.
> > > > > > I would like to eliminate that as much as possible, so I was
> > > > > > going to give that some thought first.
> > > > >
> > > > > There is no reason to stay this function as is while its twin is
> > > > > changed.
> > > > >
> > > > > Unfortunately, this is all the patch I have at this time.
> > > > >
> > > > >
> > > > > >
> > > > > >
> > > > > > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi
> > > > > > > <mailto:bluca@deb ian.org> wrote:
> > > > > > >
> > > > > > > > During bond 802.3ad receive, a burst of packets is fetched
> > > > > > > > from each slave into a local array and appended to
> > > > > > > > per-slave ring buffer.
> > > > > > > > Packets are taken from the head of the ring buffer and
> > > > > > > > returned to the caller.  The number of mbufs provided to
> > > > > > > > each slave is sufficient to meet the requirements of the
> > > > > > > > ixgbe vector receive.
> > > > > >
> > > > > > Luca,
> > > > > >
> > > > > > Can you explain these requirements of ixgbe?
> > > > > >
> > > > > > The ixgbe (and some other Intel PMDs) have vectorized RX
> > > > > > routines that are more efficient (if not faster) taking
> > > > > > advantage of some advanced CPU instructions.  I think you need
> > > > > > to be receiving at least 32 packets or more.
> > > > >
> > > > > So, why to do it in bond which is a generic driver for all the
> > > > > vendors PMDs, If for ixgbe and other Intel nics it is better you
> > > > > can force those PMDs to receive always 32 packets and to manage
> > > > > a ring by themselves.
> > > > >
> > > > > The drawback of the ring is some additional latency on the
> > > > > receive path.
> > > > > In testing, the additional latency hasn't been an issue for bonding.
> > > >
> > > > When bonding does processing slower it may be a bottleneck for the
> > > > packet processing for some application.
> > > >
> > > > > The bonding PMD has a fair bit of overhead associated with the
> > > > > RX and TX path calculations.  Most applications can just arrange
> > > > > to call the RX path with a sufficiently large receive.  Bonding
> > > > > can't do this.
> > > >
> > > > I didn't talk on application I talked on the slave PMDs, The slave
> > > > PMD can manage a ring by itself if it helps for its own performance.
> > > > The bonding should not be oriented to specific PMDs.
> > >
> > > The issue though is that the performance problem is not with the
> > > individual PMDs - it's with bonding. There were no reports regarding
> > > the individual PMDs.
> > > This comes from reports from customers from real world production
> > > deployments - the issue of bonding being too slow was raised multiple
> times.
> > > This patch addresses those issues, again in production deployments,
> > > where it's been used for years, to users and customers satisfaction.
> >
> > From Chas I understood that using burst of 32 helps for some slave PMDs
> performance which makes sense.
> > I can't understand how the extra copy phases improves the bonding itself
> performance:
> >
> > You added 2 copy phases in the bonding RX function:
> > 1. Get packets from the slave to a local array.
> > 2. Copy packet pointers from a local array to the ring array.
> > 3. Copy packet pointers from the ring array to the application array.
> >
> > Each packet arriving to the application must pass the above 3 phases(in a
> specific call or in previous calls).
> >
> > Without this patch we have only -
> > Get packets from the slave to the application array.
> >
> > Can you explain how the extra copies improves the bonding performance?
> >
> > Looks like it improves the slaves PMDs and because of that the bonding
> PMD performance becomes better.
> 
> I'm not sure that adding more buffer management to the vector PMDs will
> improve the drivers' performance; it's just that calling the rx function in such
> a way that it returns no data wastes time.

Sorry, I don't fully understand what you said here, please rephrase.

>  The bonding driver is already an exercise in buffer management so adding this layer of indirection here makes
> sense in my opinion, as does hiding the details of the consituent interfaces where possible.

Can you explain how this new buffer management with the extra pointer copies improves the bonding itself performance?
Looks really strange to me.

> > > So I'd like to share this improvement rather than keeping it private
> > > - because I'm nice that way :-P
> > >
> > > > > > Did you check for other vendor PMDs? It may hurt performance
> > > > > > there..
> > > > > >
> > > > > > I don't know, but I suspect probably not.  For the most part
> > > > > > you are typically reading almost up to the vector requirement.
> > > > > > But if one slave has just a single packet, then you can't
> > > > > > vectorize on the next slave.
> > > > > >
> > > > >
> > > > > I don't think that the ring overhead is better for PMDs which
> > > > > are not using the vectorized instructions.
> > > > >
> > > > > The non-vectorized PMDs are usually quite slow.  The additional
> > > > > overhead doesn't make a difference in their performance.
> > > >
> > > > We should not do things worse than they are.
> > >
> > > There were no reports that this made things worse. The feedback from
> > > production was that it made things better.
> >
> > Yes, It may be good for specific slaves drivers but hurt another
> > slaves drivers, So maybe it should stay private to specific costumers using
> specific nics.
> >
> > Again, I can understand how this patch improves performance of some
> > PMDs therefore I think the bonding is not the place to add it but maybe
> some PMDs.
  
Chas Williams Aug. 23, 2018, 3:51 p.m. UTC | #12
On Thu, Aug 23, 2018 at 3:28 AM Matan Azrad <matan@mellanox.com> wrote:

> Hi
>
> From: Eric Kinzie
> > On Wed Aug 22 11:42:37 +0000 2018, Matan Azrad wrote:
> > > Hi Luca
> > >
> > > From: Luca Boccassi
> > > > On Wed, 2018-08-22 at 07:09 +0000, Matan Azrad wrote:
> > > > > Hi Chas
> > > > >
> > > > > From: Chas Williams
> > > > > > On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad
> > > > > > <mailto:matan@mellanox .com> wrote:
> > > > > > Hi Chas
> > > > > >
> > > > > > From: Chas Williams
> > > > > > > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad
> > > > > > > <mailto:matan@mellano x.com>
> > > > > > > wrote:
> > > > > > > Hi
> > > > > > >
> > > > > > > From: Chas Williams
> > > > > > > > This will need to be implemented for some of the other RX
> > > > > > > > burst methods at some point for other modes to see this
> > > > > > > > performance improvement (with the exception of
> active-backup).
> > > > > > >
> > > > > > > Yes, I think it should be done at least to
> > > > > > > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for
> > now.
> > > > > > >
> > > > > > > There is some duplicated code between the various RX paths.
> > > > > > > I would like to eliminate that as much as possible, so I was
> > > > > > > going to give that some thought first.
> > > > > >
> > > > > > There is no reason to stay this function as is while its twin is
> > > > > > changed.
> > > > > >
> > > > > > Unfortunately, this is all the patch I have at this time.
> > > > > >
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi
> > > > > > > > <mailto:bluca@deb ian.org> wrote:
> > > > > > > >
> > > > > > > > > During bond 802.3ad receive, a burst of packets is fetched
> > > > > > > > > from each slave into a local array and appended to
> > > > > > > > > per-slave ring buffer.
> > > > > > > > > Packets are taken from the head of the ring buffer and
> > > > > > > > > returned to the caller.  The number of mbufs provided to
> > > > > > > > > each slave is sufficient to meet the requirements of the
> > > > > > > > > ixgbe vector receive.
> > > > > > >
> > > > > > > Luca,
> > > > > > >
> > > > > > > Can you explain these requirements of ixgbe?
> > > > > > >
> > > > > > > The ixgbe (and some other Intel PMDs) have vectorized RX
> > > > > > > routines that are more efficient (if not faster) taking
> > > > > > > advantage of some advanced CPU instructions.  I think you need
> > > > > > > to be receiving at least 32 packets or more.
> > > > > >
> > > > > > So, why to do it in bond which is a generic driver for all the
> > > > > > vendors PMDs, If for ixgbe and other Intel nics it is better you
> > > > > > can force those PMDs to receive always 32 packets and to manage
> > > > > > a ring by themselves.
> > > > > >
> > > > > > The drawback of the ring is some additional latency on the
> > > > > > receive path.
> > > > > > In testing, the additional latency hasn't been an issue for
> bonding.
> > > > >
> > > > > When bonding does processing slower it may be a bottleneck for the
> > > > > packet processing for some application.
> > > > >
> > > > > > The bonding PMD has a fair bit of overhead associated with the
> > > > > > RX and TX path calculations.  Most applications can just arrange
> > > > > > to call the RX path with a sufficiently large receive.  Bonding
> > > > > > can't do this.
> > > > >
> > > > > I didn't talk on application I talked on the slave PMDs, The slave
> > > > > PMD can manage a ring by itself if it helps for its own
> performance.
> > > > > The bonding should not be oriented to specific PMDs.
> > > >
> > > > The issue though is that the performance problem is not with the
> > > > individual PMDs - it's with bonding. There were no reports regarding
> > > > the individual PMDs.
> > > > This comes from reports from customers from real world production
> > > > deployments - the issue of bonding being too slow was raised multiple
> > times.
> > > > This patch addresses those issues, again in production deployments,
> > > > where it's been used for years, to users and customers satisfaction.
> > >
> > > From Chas I understood that using burst of 32 helps for some slave PMDs
> > performance which makes sense.
> > > I can't understand how the extra copy phases improves the bonding
> itself
> > performance:
> > >
> > > You added 2 copy phases in the bonding RX function:
> > > 1. Get packets from the slave to a local array.
> > > 2. Copy packet pointers from a local array to the ring array.
> > > 3. Copy packet pointers from the ring array to the application array.
> > >
> > > Each packet arriving to the application must pass the above 3
> phases(in a
> > specific call or in previous calls).
> > >
> > > Without this patch we have only -
> > > Get packets from the slave to the application array.
> > >
> > > Can you explain how the extra copies improves the bonding performance?
> > >
> > > Looks like it improves the slaves PMDs and because of that the bonding
> > PMD performance becomes better.
> >
> > I'm not sure that adding more buffer management to the vector PMDs will
> > improve the drivers' performance; it's just that calling the rx function
> in such
> > a way that it returns no data wastes time.
>
> Sorry, I don't fully understand what you said here, please rephrase.
>
> >  The bonding driver is already an exercise in buffer management so
> adding this layer of indirection here makes
> > sense in my opinion, as does hiding the details of the consituent
> interfaces where possible.
>
> Can you explain how this new buffer management with the extra pointer
> copies improves the bonding itself performance?
> Looks really strange to me.
>

Because rings are generally quite efficient.

Bonding is in a middle ground between application and PMD.  What bonding
is doing, may not improve all applications.  If using a ring to buffer
the vectorized receive routines, improves your particular application,
that's great.  However, I don't think I can say that it would help all
applications.  As you point out, there is overhead associated with
a ring.

Bonding's receive burst isn't especially efficient (in mode 4).  Bonding
benefits from being able to read as much as possible (within limits of
course, large reads would blow out caches) from each slave.  It can't
return all that data though because applications tend to use the
burst size that would be efficient for a typical PMD.  An alternative
might be to ask bonding applications to simply issue larger reads for
certain modes.  That's probably not as easy as it sounds given the
way that the burst length effects multiplexing.

Another solution might be just alternatively poll the individual
slaves on each rx burst.  But that means you need to poll at a
faster rate.  Depending on your application, you might not be
able to do that.  We can avoid this scheduling overhead by just
doing the extra reads in bonding and buffering in a ring.

Since bonding is going to be buffering everything in a ring, it makes
sense to just read as much as is as efficiently possible.  For the
Intel adapters, this means using a read big enough trigger the vectorized
versions.



>
> > > > So I'd like to share this improvement rather than keeping it private
> > > > - because I'm nice that way :-P
> > > >
> > > > > > > Did you check for other vendor PMDs? It may hurt performance
> > > > > > > there..
> > > > > > >
> > > > > > > I don't know, but I suspect probably not.  For the most part
> > > > > > > you are typically reading almost up to the vector requirement.
> > > > > > > But if one slave has just a single packet, then you can't
> > > > > > > vectorize on the next slave.
> > > > > > >
> > > > > >
> > > > > > I don't think that the ring overhead is better for PMDs which
> > > > > > are not using the vectorized instructions.
> > > > > >
> > > > > > The non-vectorized PMDs are usually quite slow.  The additional
> > > > > > overhead doesn't make a difference in their performance.
> > > > >
> > > > > We should not do things worse than they are.
> > > >
> > > > There were no reports that this made things worse. The feedback from
> > > > production was that it made things better.
> > >
> > > Yes, It may be good for specific slaves drivers but hurt another
> > > slaves drivers, So maybe it should stay private to specific costumers
> using
> > specific nics.
> > >
> > > Again, I can understand how this patch improves performance of some
> > > PMDs therefore I think the bonding is not the place to add it but maybe
> > some PMDs.
>
  
Matan Azrad Aug. 26, 2018, 7:40 a.m. UTC | #13
From: Chas Williams <3chas3@gmail.com> 
>On Thu, Aug 23, 2018 at 3:28 AM Matan Azrad <mailto:matan@mellanox.com> wrote:
>Hi
>
>From: Eric Kinzie
>> On Wed Aug 22 11:42:37 +0000 2018, Matan Azrad wrote:
>> > Hi Luca
>> >
>> > From: Luca Boccassi
>> > > On Wed, 2018-08-22 at 07:09 +0000, Matan Azrad wrote:
>> > > > Hi Chas
>> > > >
>> > > > From: Chas Williams
>> > > > > On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad
>> > > > > <mailto:mailto:matan@mellanox .com> wrote:
>> > > > > Hi Chas
>> > > > >
>> > > > > From: Chas Williams
>> > > > > > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad
>> > > > > > <mailto:mailto:matan@mellano https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fx.com&data=02%7C01%7Cmatan%40mellanox.com%7Cc662ec1ee7734d12025808d609104474%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636706362822778011&sdata=MNAx1E5TgOrzXO9N8SWCOojrWmbqD8DPND%2BCXorOYhQ%3D&reserved=0>
>> > > > > > wrote:
>> > > > > > Hi
>> > > > > >
>> > > > > > From: Chas Williams
>> > > > > > > This will need to be implemented for some of the other RX
>> > > > > > > burst methods at some point for other modes to see this
>> > > > > > > performance improvement (with the exception of active-backup).
>> > > > > >
>> > > > > > Yes, I think it should be done at least to
>> > > > > > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for
>> now.
>> > > > > >
>> > > > > > There is some duplicated code between the various RX paths.
>> > > > > > I would like to eliminate that as much as possible, so I was
>> > > > > > going to give that some thought first.
>> > > > >
>> > > > > There is no reason to stay this function as is while its twin is
>> > > > > changed.
>> > > > >
>> > > > > Unfortunately, this is all the patch I have at this time.
>> > > > >
>> > > > >
>> > > > > >
>> > > > > >
>> > > > > > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi
>> > > > > > > <mailto:mailto:bluca@deb https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fian.org&data=02%7C01%7Cmatan%40mellanox.com%7Cc662ec1ee7734d12025808d609104474%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636706362822778011&sdata=AAcm%2FcbAA4CQsOnXFZWIqii6T%2BFUcc8xxT7%2Fs3tKIfY%3D&reserved=0> wrote:
>> > > > > > >
>> > > > > > > > During bond 802.3ad receive, a burst of packets is fetched
>> > > > > > > > from each slave into a local array and appended to
>> > > > > > > > per-slave ring buffer.
>> > > > > > > > Packets are taken from the head of the ring buffer and
>> > > > > > > > returned to the caller.  The number of mbufs provided to
>> > > > > > > > each slave is sufficient to meet the requirements of the
>> > > > > > > > ixgbe vector receive.
>> > > > > >
>> > > > > > Luca,
>> > > > > >
>> > > > > > Can you explain these requirements of ixgbe?
>> > > > > >
>> > > > > > The ixgbe (and some other Intel PMDs) have vectorized RX
>> > > > > > routines that are more efficient (if not faster) taking
>> > > > > > advantage of some advanced CPU instructions.  I think you need
>> > > > > > to be receiving at least 32 packets or more.
>> > > > >
>> > > > > So, why to do it in bond which is a generic driver for all the
>> > > > > vendors PMDs, If for ixgbe and other Intel nics it is better you
>> > > > > can force those PMDs to receive always 32 packets and to manage
>> > > > > a ring by themselves.
>> > > > >
>> > > > > The drawback of the ring is some additional latency on the
>> > > > > receive path.
>> > > > > In testing, the additional latency hasn't been an issue for bonding.
>> > > >
>> > > > When bonding does processing slower it may be a bottleneck for the
>> > > > packet processing for some application.
>> > > >
>> > > > > The bonding PMD has a fair bit of overhead associated with the
>> > > > > RX and TX path calculations.  Most applications can just arrange
>> > > > > to call the RX path with a sufficiently large receive.  Bonding
>> > > > > can't do this.
>> > > >
>> > > > I didn't talk on application I talked on the slave PMDs, The slave
>> > > > PMD can manage a ring by itself if it helps for its own performance.
>> > > > The bonding should not be oriented to specific PMDs.
>> > >
>> > > The issue though is that the performance problem is not with the
>> > > individual PMDs - it's with bonding. There were no reports regarding
>> > > the individual PMDs.
>> > > This comes from reports from customers from real world production
>> > > deployments - the issue of bonding being too slow was raised multiple
>> times.
>> > > This patch addresses those issues, again in production deployments,
>> > > where it's been used for years, to users and customers satisfaction.
>> >
>> > From Chas I understood that using burst of 32 helps for some slave PMDs
>> performance which makes sense.
>> > I can't understand how the extra copy phases improves the bonding itself
>> performance:
>> >
>> > You added 2 copy phases in the bonding RX function:
>> > 1. Get packets from the slave to a local array.
>> > 2. Copy packet pointers from a local array to the ring array.
>> > 3. Copy packet pointers from the ring array to the application array.
>> >
>> > Each packet arriving to the application must pass the above 3 phases(in a
>> specific call or in previous calls).
>> >
>> > Without this patch we have only -
>> > Get packets from the slave to the application array.
>> >
>> > Can you explain how the extra copies improves the bonding performance?
>> >
>> > Looks like it improves the slaves PMDs and because of that the bonding
>> PMD performance becomes better.
>> 
>> I'm not sure that adding more buffer management to the vector PMDs will
>> improve the drivers' performance; it's just that calling the rx function in such
>> a way that it returns no data wastes time.
>
>Sorry, I don't fully understand what you said here, please rephrase.
>
>>  The bonding driver is already an exercise in buffer management so adding this layer of indirection here makes
>> sense in my opinion, as does hiding the details of the consituent interfaces where possible.
>
>Can you explain how this new buffer management with the extra pointer copies improves the bonding itself performance?
>Looks really strange to me.
>
>Because rings are generally quite efficient.

But you are using a ring in addition to regular array management, it must hurt performance of the bonding PMD
(means the bonding itself - not the slaves PMDs which are called from the bonding)

>
>Bonding is in a middle ground between application and PMD.
Yes.
>What bonding is doing, may not improve all applications.
Yes, but it can be solved using some bonding modes.
> If using a ring to buffer the vectorized receive routines, improves your particular application,
>that's great. 
It may be not great and even bad for some other PMDs which are not vectororized.

> However, I don't think I can say that it would help all
>applications.  As you point out, there is overhead associated with
>a ring.
Yes.
>Bonding's receive burst isn't especially efficient (in mode 4).

Why?

> Bonding benefits from being able to read as much as possible (within limits of
>course, large reads would blow out caches) from each slave.

The slaves PMDs can benefits in the same way.

>It can't return all that data though because applications tend to use the 
>burst size that would be efficient for a typical PMD.

What is the preferred burst size of the bonding? Maybe the application should use it when they are using bonding.
 
>An alternative might be to ask bonding applications to simply issue larger reads for
>certain modes.  That's probably not as easy as it sounds given the
>way that the burst length effects multiplexing.

Can you explain it more?

>Another solution might be just alternatively poll the individual
>slaves on each rx burst.  But that means you need to poll at a
>faster rate.  Depending on your application, you might not be
>able to do that.

Again, can you be more precise in the above explanation?

>  We can avoid this scheduling overhead by just
>doing the extra reads in bonding and buffering in a ring.
>
>Since bonding is going to be buffering everything in a ring,

? I don't familiar with it. For now I don't think we need a ring.

 >it makes sense to just read as much as is as efficiently possible.  For the
>Intel adapters, this means using a read big enough trigger the vectorized
>versions.
>> > > So I'd like to share this improvement rather than keeping it private
>> > > - because I'm nice that way :-P
>> > >
>> > > > > > Did you check for other vendor PMDs? It may hurt performance
>> > > > > > there..
>> > > > > >
>> > > > > > I don't know, but I suspect probably not.  For the most part
>> > > > > > you are typically reading almost up to the vector requirement.
>> > > > > > But if one slave has just a single packet, then you can't
>> > > > > > vectorize on the next slave.
>> > > > > >
>> > > > >
>> > > > > I don't think that the ring overhead is better for PMDs which
>> > > > > are not using the vectorized instructions.
>> > > > >
>> > > > > The non-vectorized PMDs are usually quite slow.  The additional
>> > > > > overhead doesn't make a difference in their performance.
>> > > >
>> > > > We should not do things worse than they are.
>> > >
>> > > There were no reports that this made things worse. The feedback from
>> > > production was that it made things better.
>> >
>> > Yes, It may be good for specific slaves drivers but hurt another
>> > slaves drivers, So maybe it should stay private to specific costumers using
>> specific nics.
>> >
>> > Again, I can understand how this patch improves performance of some
>> > PMDs therefore I think the bonding is not the place to add it but maybe
>> some PMDs.
>
  
Chas Williams Aug. 27, 2018, 1:22 p.m. UTC | #14
On Sun, Aug 26, 2018 at 3:40 AM Matan Azrad <matan@mellanox.com> wrote:

>
> From: Chas Williams <3chas3@gmail.com>
> >On Thu, Aug 23, 2018 at 3:28 AM Matan Azrad <mailto:matan@mellanox.com>
> wrote:
> >Hi
> >
> >From: Eric Kinzie
> >> On Wed Aug 22 11:42:37 +0000 2018, Matan Azrad wrote:
> >> > Hi Luca
> >> >
> >> > From: Luca Boccassi
> >> > > On Wed, 2018-08-22 at 07:09 +0000, Matan Azrad wrote:
> >> > > > Hi Chas
> >> > > >
> >> > > > From: Chas Williams
> >> > > > > On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad
> >> > > > > <mailto:mailto:matan@mellanox .com> wrote:
> >> > > > > Hi Chas
> >> > > > >
> >> > > > > From: Chas Williams
> >> > > > > > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad
> >> > > > > > <mailto:mailto:matan@mellano
> https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fx.com&data=02%7C01%7Cmatan%40mellanox.com%7Cc662ec1ee7734d12025808d609104474%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636706362822778011&sdata=MNAx1E5TgOrzXO9N8SWCOojrWmbqD8DPND%2BCXorOYhQ%3D&reserved=0
> >
> >> > > > > > wrote:
> >> > > > > > Hi
> >> > > > > >
> >> > > > > > From: Chas Williams
> >> > > > > > > This will need to be implemented for some of the other RX
> >> > > > > > > burst methods at some point for other modes to see this
> >> > > > > > > performance improvement (with the exception of
> active-backup).
> >> > > > > >
> >> > > > > > Yes, I think it should be done at least to
> >> > > > > > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for
> >> now.
> >> > > > > >
> >> > > > > > There is some duplicated code between the various RX paths.
> >> > > > > > I would like to eliminate that as much as possible, so I was
> >> > > > > > going to give that some thought first.
> >> > > > >
> >> > > > > There is no reason to stay this function as is while its twin is
> >> > > > > changed.
> >> > > > >
> >> > > > > Unfortunately, this is all the patch I have at this time.
> >> > > > >
> >> > > > >
> >> > > > > >
> >> > > > > >
> >> > > > > > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi
> >> > > > > > > <mailto:mailto:bluca@deb
> https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fian.org&data=02%7C01%7Cmatan%40mellanox.com%7Cc662ec1ee7734d12025808d609104474%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636706362822778011&sdata=AAcm%2FcbAA4CQsOnXFZWIqii6T%2BFUcc8xxT7%2Fs3tKIfY%3D&reserved=0>
> wrote:
> >> > > > > > >
> >> > > > > > > > During bond 802.3ad receive, a burst of packets is fetched
> >> > > > > > > > from each slave into a local array and appended to
> >> > > > > > > > per-slave ring buffer.
> >> > > > > > > > Packets are taken from the head of the ring buffer and
> >> > > > > > > > returned to the caller.  The number of mbufs provided to
> >> > > > > > > > each slave is sufficient to meet the requirements of the
> >> > > > > > > > ixgbe vector receive.
> >> > > > > >
> >> > > > > > Luca,
> >> > > > > >
> >> > > > > > Can you explain these requirements of ixgbe?
> >> > > > > >
> >> > > > > > The ixgbe (and some other Intel PMDs) have vectorized RX
> >> > > > > > routines that are more efficient (if not faster) taking
> >> > > > > > advantage of some advanced CPU instructions.  I think you need
> >> > > > > > to be receiving at least 32 packets or more.
> >> > > > >
> >> > > > > So, why to do it in bond which is a generic driver for all the
> >> > > > > vendors PMDs, If for ixgbe and other Intel nics it is better you
> >> > > > > can force those PMDs to receive always 32 packets and to manage
> >> > > > > a ring by themselves.
> >> > > > >
> >> > > > > The drawback of the ring is some additional latency on the
> >> > > > > receive path.
> >> > > > > In testing, the additional latency hasn't been an issue for
> bonding.
> >> > > >
> >> > > > When bonding does processing slower it may be a bottleneck for the
> >> > > > packet processing for some application.
> >> > > >
> >> > > > > The bonding PMD has a fair bit of overhead associated with the
> >> > > > > RX and TX path calculations.  Most applications can just arrange
> >> > > > > to call the RX path with a sufficiently large receive.  Bonding
> >> > > > > can't do this.
> >> > > >
> >> > > > I didn't talk on application I talked on the slave PMDs, The slave
> >> > > > PMD can manage a ring by itself if it helps for its own
> performance.
> >> > > > The bonding should not be oriented to specific PMDs.
> >> > >
> >> > > The issue though is that the performance problem is not with the
> >> > > individual PMDs - it's with bonding. There were no reports regarding
> >> > > the individual PMDs.
> >> > > This comes from reports from customers from real world production
> >> > > deployments - the issue of bonding being too slow was raised
> multiple
> >> times.
> >> > > This patch addresses those issues, again in production deployments,
> >> > > where it's been used for years, to users and customers satisfaction.
> >> >
> >> > From Chas I understood that using burst of 32 helps for some slave
> PMDs
> >> performance which makes sense.
> >> > I can't understand how the extra copy phases improves the bonding
> itself
> >> performance:
> >> >
> >> > You added 2 copy phases in the bonding RX function:
> >> > 1. Get packets from the slave to a local array.
> >> > 2. Copy packet pointers from a local array to the ring array.
> >> > 3. Copy packet pointers from the ring array to the application array.
> >> >
> >> > Each packet arriving to the application must pass the above 3
> phases(in a
> >> specific call or in previous calls).
> >> >
> >> > Without this patch we have only -
> >> > Get packets from the slave to the application array.
> >> >
> >> > Can you explain how the extra copies improves the bonding performance?
> >> >
> >> > Looks like it improves the slaves PMDs and because of that the bonding
> >> PMD performance becomes better.
> >>
> >> I'm not sure that adding more buffer management to the vector PMDs will
> >> improve the drivers' performance; it's just that calling the rx
> function in such
> >> a way that it returns no data wastes time.
> >
> >Sorry, I don't fully understand what you said here, please rephrase.
> >
> >>  The bonding driver is already an exercise in buffer management so
> adding this layer of indirection here makes
> >> sense in my opinion, as does hiding the details of the consituent
> interfaces where possible.
> >
> >Can you explain how this new buffer management with the extra pointer
> copies improves the bonding itself performance?
> >Looks really strange to me.
> >
> >Because rings are generally quite efficient.
>
> But you are using a ring in addition to regular array management, it must
> hurt performance of the bonding PMD
> (means the bonding itself - not the slaves PMDs which are called from the
> bonding)
>
> >
> >Bonding is in a middle ground between application and PMD.
> Yes.
> >What bonding is doing, may not improve all applications.
> Yes, but it can be solved using some bonding modes.
> > If using a ring to buffer the vectorized receive routines, improves your
> particular application,
> >that's great.
> It may be not great and even bad for some other PMDs which are not
> vectororized.
>
> > However, I don't think I can say that it would help all
> >applications.  As you point out, there is overhead associated with
> >a ring.
> Yes.
> >Bonding's receive burst isn't especially efficient (in mode 4).
>
> Why?
>

It makes a copy of the slaves, has a fair bit of stack usage,
needs to check the slave status, and needs to examine each
packet to see if it is a slow protocol packet.  So each
packet is essentially read twice.  The fast queue code for mode 4
avoids some of this (and probably ignores checking collecting
incorrectly).  If you find a slow protocol packet, you need to
chop it out of the array with memmove due to overlap.



>
> > Bonding benefits from being able to read as much as possible (within
> limits of
> >course, large reads would blow out caches) from each slave.
>
> The slaves PMDs can benefits in the same way.
>
> >It can't return all that data though because applications tend to use the
> >burst size that would be efficient for a typical PMD.
>
> What is the preferred burst size of the bonding? Maybe the application
> should use it when they are using bonding.
>

The preferred burst size for bonding would be the sum of all the
slaves ideal read size.  However, that's not likely to be simple
since most applications decide early the size for the read/write
burst operations.


>
> >An alternative might be to ask bonding applications to simply issue
> larger reads for
> >certain modes.  That's probably not as easy as it sounds given the
> >way that the burst length effects multiplexing.
>
> Can you explain it more?
>

A longer burst size on one PMD will tend to favor that PMD
over others.  It will fill your internal queues with more
of its packets.



>
> >Another solution might be just alternatively poll the individual
> >slaves on each rx burst.  But that means you need to poll at a
> >faster rate.  Depending on your application, you might not be
> >able to do that.
>
> Again, can you be more precise in the above explanation?
>

If the application knows that there are two slaves backing
a bonding interface, the application could just read twice
from the bonding interface, knowing that the bonding
interface is going to alternate between the slaves.  But
this requires the application to know things about the bonding
PMD, like the number of slaves.



>
> >  We can avoid this scheduling overhead by just
> >doing the extra reads in bonding and buffering in a ring.
> >
> >Since bonding is going to be buffering everything in a ring,
>
> ? I don't familiar with it. For now I don't think we need a ring.
>

We have some empirical testing that shows performance improvements.
How do you explain this?  Rings aren't inefficient.  There's
significant overhead of calling the rx burst routine on any PMD.
You need to get the PMD data structures into local memory.
Reading as much as possible makes sense.  Could you generally
do this for all PMDs?  Yes, but the ring adds latency.  Latency
that isn't an issue for the bonding PMD because of all the
other inefficiencies (for mode 4).


>
>  >it makes sense to just read as much as is as efficiently possible.  For
> the
> >Intel adapters, this means using a read big enough trigger the vectorized
> >versions.
> >> > > So I'd like to share this improvement rather than keeping it private
> >> > > - because I'm nice that way :-P
> >> > >
> >> > > > > > Did you check for other vendor PMDs? It may hurt performance
> >> > > > > > there..
> >> > > > > >
> >> > > > > > I don't know, but I suspect probably not.  For the most part
> >> > > > > > you are typically reading almost up to the vector requirement.
> >> > > > > > But if one slave has just a single packet, then you can't
> >> > > > > > vectorize on the next slave.
> >> > > > > >
> >> > > > >
> >> > > > > I don't think that the ring overhead is better for PMDs which
> >> > > > > are not using the vectorized instructions.
> >> > > > >
> >> > > > > The non-vectorized PMDs are usually quite slow.  The additional
> >> > > > > overhead doesn't make a difference in their performance.
> >> > > >
> >> > > > We should not do things worse than they are.
> >> > >
> >> > > There were no reports that this made things worse. The feedback from
> >> > > production was that it made things better.
> >> >
> >> > Yes, It may be good for specific slaves drivers but hurt another
> >> > slaves drivers, So maybe it should stay private to specific costumers
> using
> >> specific nics.
> >> >
> >> > Again, I can understand how this patch improves performance of some
> >> > PMDs therefore I think the bonding is not the place to add it but
> maybe
> >> some PMDs.
> >
>
  
Matan Azrad Aug. 27, 2018, 3:30 p.m. UTC | #15
Hi Chas

From: Chas Williams <3chas3@gmail.com> 
>On Sun, Aug 26, 2018 at 3:40 AM Matan Azrad <mailto:matan@mellanox.com> wrote:
>
>From: Chas Williams <mailto:3chas3@gmail.com> 
>>On Thu, Aug 23, 2018 at 3:28 AM Matan Azrad <mailto:mailto:matan@mellanox.com> wrote:
>>Hi
>>
>>From: Eric Kinzie
>>> On Wed Aug 22 11:42:37 +0000 2018, Matan Azrad wrote:
>>> > Hi Luca
>>> >
>>> > From: Luca Boccassi
>>> > > On Wed, 2018-08-22 at 07:09 +0000, Matan Azrad wrote:
>>> > > > Hi Chas
>>> > > >
>>> > > > From: Chas Williams
>>> > > > > On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad
>>> > > > > <mailto:mailto:mailto:matan@mellanox .com> wrote:
>>> > > > > Hi Chas
>>> > > > >
>>> > > > > From: Chas Williams
>>> > > > > > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad
>>> > > > > > <mailto:mailto:mailto:matan@mellano https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fx.com&data=02%7C01%7Cmatan%40mellanox.com%7C7ee011bf19224cb17d9a08d60c202379%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636709729507525613&sdata=TiWaRq8A%2FvUaqPIT6ajfJdcY7eIAGPqB%2BiHppcFdZJo%3D&reserved=0>
>>> > > > > > wrote:
>>> > > > > > Hi
>>> > > > > >
>>> > > > > > From: Chas Williams
>>> > > > > > > This will need to be implemented for some of the other RX
>>> > > > > > > burst methods at some point for other modes to see this
>>> > > > > > > performance improvement (with the exception of active-backup).
>>> > > > > >
>>> > > > > > Yes, I think it should be done at least to
>>> > > > > > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for
>>> now.
>>> > > > > >
>>> > > > > > There is some duplicated code between the various RX paths.
>>> > > > > > I would like to eliminate that as much as possible, so I was
>>> > > > > > going to give that some thought first.
>>> > > > >
>>> > > > > There is no reason to stay this function as is while its twin is
>>> > > > > changed.
>>> > > > >
>>> > > > > Unfortunately, this is all the patch I have at this time.
>>> > > > >
>>> > > > >
>>> > > > > >
>>> > > > > >
>>> > > > > > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi
>>> > > > > > > <mailto:mailto:mailto:bluca@deb https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fian.org&data=02%7C01%7Cmatan%40mellanox.com%7C7ee011bf19224cb17d9a08d60c202379%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636709729507525613&sdata=0ALmgN%2B6Xnrl3kKeeoJRGTlJmeLZlmcJwsOTcncMUWE%3D&reserved=0> wrote:
>>> > > > > > >
>>> > > > > > > > During bond 802.3ad receive, a burst of packets is fetched
>>> > > > > > > > from each slave into a local array and appended to
>>> > > > > > > > per-slave ring buffer.
>>> > > > > > > > Packets are taken from the head of the ring buffer and
>>> > > > > > > > returned to the caller.  The number of mbufs provided to
>>> > > > > > > > each slave is sufficient to meet the requirements of the
>>> > > > > > > > ixgbe vector receive.
>>> > > > > >
>>> > > > > > Luca,
>>> > > > > >
>>> > > > > > Can you explain these requirements of ixgbe?
>>> > > > > >
>>> > > > > > The ixgbe (and some other Intel PMDs) have vectorized RX
>>> > > > > > routines that are more efficient (if not faster) taking
>>> > > > > > advantage of some advanced CPU instructions.  I think you need
>>> > > > > > to be receiving at least 32 packets or more.
>>> > > > >
>>> > > > > So, why to do it in bond which is a generic driver for all the
>>> > > > > vendors PMDs, If for ixgbe and other Intel nics it is better you
>>> > > > > can force those PMDs to receive always 32 packets and to manage
>>> > > > > a ring by themselves.
>>> > > > >
>>> > > > > The drawback of the ring is some additional latency on the
>>> > > > > receive path.
>>> > > > > In testing, the additional latency hasn't been an issue for bonding.
>>> > > >
>>> > > > When bonding does processing slower it may be a bottleneck for the
>>> > > > packet processing for some application.
>>> > > >
>>> > > > > The bonding PMD has a fair bit of overhead associated with the
>>> > > > > RX and TX path calculations.  Most applications can just arrange
>>> > > > > to call the RX path with a sufficiently large receive.  Bonding
>>> > > > > can't do this.
>>> > > >
>>> > > > I didn't talk on application I talked on the slave PMDs, The slave
>>> > > > PMD can manage a ring by itself if it helps for its own performance.
>>> > > > The bonding should not be oriented to specific PMDs.
>>> > >
>>> > > The issue though is that the performance problem is not with the
>>> > > individual PMDs - it's with bonding. There were no reports regarding
>>> > > the individual PMDs.
>>> > > This comes from reports from customers from real world production
>>> > > deployments - the issue of bonding being too slow was raised multiple
>>> times.
>>> > > This patch addresses those issues, again in production deployments,
>>> > > where it's been used for years, to users and customers satisfaction.
>>> >
>>> > From Chas I understood that using burst of 32 helps for some slave PMDs
>>> performance which makes sense.
>>> > I can't understand how the extra copy phases improves the bonding itself
>>> performance:
>>> >
>>> > You added 2 copy phases in the bonding RX function:
>>> > 1. Get packets from the slave to a local array.
>>> > 2. Copy packet pointers from a local array to the ring array.
>>> > 3. Copy packet pointers from the ring array to the application array.
>>> >
>>> > Each packet arriving to the application must pass the above 3 phases(in a
>>> specific call or in previous calls).
>>> >
>>> > Without this patch we have only -
>>> > Get packets from the slave to the application array.
>>> >
>>> > Can you explain how the extra copies improves the bonding performance?
>>> >
>>> > Looks like it improves the slaves PMDs and because of that the bonding
>>> PMD performance becomes better.
>>> 
>>> I'm not sure that adding more buffer management to the vector PMDs will
>>> improve the drivers' performance; it's just that calling the rx function in such
>>> a way that it returns no data wastes time.
>>
>>Sorry, I don't fully understand what you said here, please rephrase.
>>
>>>  The bonding driver is already an exercise in buffer management so adding this layer of indirection here makes
>>> sense in my opinion, as does hiding the details of the consituent interfaces where possible.
>>
>>Can you explain how this new buffer management with the extra pointer copies improves the bonding itself performance?
>>Looks really strange to me.
>>
>>Because rings are generally quite efficient.
>
>But you are using a ring in addition to regular array management, it must hurt performance of the bonding PMD
>(means the bonding itself - not the slaves PMDs which are called from the bonding)
>
>>
>>Bonding is in a middle ground between application and PMD.
>Yes.
>>What bonding is doing, may not improve all applications.
>Yes, but it can be solved using some bonding modes.
>> If using a ring to buffer the vectorized receive routines, improves your particular application,
>>that's great. 
>It may be not great and even bad for some other PMDs which are not vectororized.
>
>> However, I don't think I can say that it would help all
>>applications.  As you point out, there is overhead associated with
>>a ring.
>Yes.
>>Bonding's receive burst isn't especially efficient (in mode 4).
>
>Why?
>
>It makes a copy of the slaves, has a fair bit of stack usage, 
>needs to check the slave status, and needs to examine each
>packet to see if it is a slow protocol packet.  So each
>packet is essentially read twice.  The fast queue code for mode 4
>avoids some of this (and probably ignores checking collecting
>incorrectly).  If you find a slow protocol packet, you need to
>chop it out of the array with memmove due to overlap.

Agree.
So to improve the bonding performance you need to optimize the aboves problems.
There is no connection to the ring.
 
>> Bonding benefits from being able to read as much as possible (within limits of
>>course, large reads would blow out caches) from each slave.
>
>The slaves PMDs can benefits in the same way.
>
>>It can't return all that data though because applications tend to use the 
>>burst size that would be efficient for a typical PMD.
>
>What is the preferred burst size of the bonding? Maybe the application should use it when they are using bonding.
>
>The preferred burst size for bonding would be the sum of all the
>slaves ideal read size.  However, that's not likely to be simple
>since most applications decide early the size for the read/write
>burst operations.

>>An alternative might be to ask bonding applications to simply issue larger reads for
>>certain modes.  That's probably not as easy as it sounds given the
>>way that the burst length effects multiplexing.
>
>Can you explain it more?
>
>A longer burst size on one PMD will tend to favor that PMD
>over others.  It will fill your internal queues with more 
>of its packets.

Agree, it's about fairness.
 
>
>>Another solution might be just alternatively poll the individual
>>slaves on each rx burst.  But that means you need to poll at a
>>faster rate.  Depending on your application, you might not be
>>able to do that.

>Again, can you be more precise in the above explanation?
>
>If the application knows that there are two slaves backing
>a bonding interface, the application could just read twice
>from the bonding interface, knowing that the bonding 
>interface is going to alternate between the slaves.  But
>this requires the application to know things about the bonding
>PMD, like the number of slaves.

Why should the application poll twice?
Poll slave 0, than process it's packets, poll slave 1 than process its packets...
What is the problem?

>>  We can avoid this scheduling overhead by just
>>doing the extra reads in bonding and buffering in a ring.
>>
>>Since bonding is going to be buffering everything in a ring,
>
>? I don't familiar with it. For now I don't think we need a ring.
>
>We have some empirical testing that shows performance improvements.
>How do you explain this?  

You are using a hack in bonding which hurt the bonding but improves the vectorized PMD you use.

>Rings aren't inefficient.

Only when there is a need to use it.

>There's significant overhead of calling the rx burst routine on any PMD.
>You need to get the PMD data structures into local memory.

Yes.

>Reading as much as possible makes sense.

Agree.

> Could you generally do this for all PMDs?  Yes, but the ring adds latency.  Latency
>that isn't an issue for the bonding PMD because of all the
>other inefficiencies (for mode 4).

Enlarging the bonding latency by that way makes some vectorized slave PMDs happy and makes the bonding worst 
for other slaves PMDs - this is not a good idea to put it upstream.

Improving the other problems in the bonding(reduce the bonding latency) will do the job for you and for others.

> >it makes sense to just read as much as is as efficiently possible.  For the
>>Intel adapters, this means using a read big enough trigger the vectorized
>>versions.
>>> > > So I'd like to share this improvement rather than keeping it private
>>> > > - because I'm nice that way :-P
>>> > >
>>> > > > > > Did you check for other vendor PMDs? It may hurt performance
>>> > > > > > there..
>>> > > > > >
>>> > > > > > I don't know, but I suspect probably not.  For the most part
>>> > > > > > you are typically reading almost up to the vector requirement.
>>> > > > > > But if one slave has just a single packet, then you can't
>>> > > > > > vectorize on the next slave.
>>> > > > > >
>>> > > > >
>>> > > > > I don't think that the ring overhead is better for PMDs which
>>> > > > > are not using the vectorized instructions.
>>> > > > >
>>> > > > > The non-vectorized PMDs are usually quite slow.  The additional
>>> > > > > overhead doesn't make a difference in their performance.
>>> > > >
>>> > > > We should not do things worse than they are.
>>> > >
>>> > > There were no reports that this made things worse. The feedback from
>>> > > production was that it made things better.
>>> >
>>> > Yes, It may be good for specific slaves drivers but hurt another
>>> > slaves drivers, So maybe it should stay private to specific costumers using
>>> specific nics.
>>> >
>>> > Again, I can understand how this patch improves performance of some
>>> > PMDs therefore I think the bonding is not the place to add it but maybe
>>> some PMDs.
>>
>
  
Chas Williams Aug. 27, 2018, 3:51 p.m. UTC | #16
On Mon, Aug 27, 2018 at 11:30 AM Matan Azrad <matan@mellanox.com> wrote:

> Hi Chas
>
> From: Chas Williams <3chas3@gmail.com>
> >On Sun, Aug 26, 2018 at 3:40 AM Matan Azrad <mailto:matan@mellanox.com>
> wrote:
> >
> >From: Chas Williams <mailto:3chas3@gmail.com>
> >>On Thu, Aug 23, 2018 at 3:28 AM Matan Azrad <mailto:mailto:
> matan@mellanox.com> wrote:
> >>Hi
> >>
> >>From: Eric Kinzie
> >>> On Wed Aug 22 11:42:37 +0000 2018, Matan Azrad wrote:
> >>> > Hi Luca
> >>> >
> >>> > From: Luca Boccassi
> >>> > > On Wed, 2018-08-22 at 07:09 +0000, Matan Azrad wrote:
> >>> > > > Hi Chas
> >>> > > >
> >>> > > > From: Chas Williams
> >>> > > > > On Tue, Aug 21, 2018 at 11:43 AM Matan Azrad
> >>> > > > > <mailto:mailto:mailto:matan@mellanox .com> wrote:
> >>> > > > > Hi Chas
> >>> > > > >
> >>> > > > > From: Chas Williams
> >>> > > > > > On Tue, Aug 21, 2018 at 6:56 AM Matan Azrad
> >>> > > > > > <mailto:mailto:mailto:matan@mellano
> https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fx.com&data=02%7C01%7Cmatan%40mellanox.com%7C7ee011bf19224cb17d9a08d60c202379%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636709729507525613&sdata=TiWaRq8A%2FvUaqPIT6ajfJdcY7eIAGPqB%2BiHppcFdZJo%3D&reserved=0
> >
> >>> > > > > > wrote:
> >>> > > > > > Hi
> >>> > > > > >
> >>> > > > > > From: Chas Williams
> >>> > > > > > > This will need to be implemented for some of the other RX
> >>> > > > > > > burst methods at some point for other modes to see this
> >>> > > > > > > performance improvement (with the exception of
> active-backup).
> >>> > > > > >
> >>> > > > > > Yes, I think it should be done at least to
> >>> > > > > > bond_ethdev_rx_burst_8023ad_fast_queue (should be easy) for
> >>> now.
> >>> > > > > >
> >>> > > > > > There is some duplicated code between the various RX paths.
> >>> > > > > > I would like to eliminate that as much as possible, so I was
> >>> > > > > > going to give that some thought first.
> >>> > > > >
> >>> > > > > There is no reason to stay this function as is while its twin
> is
> >>> > > > > changed.
> >>> > > > >
> >>> > > > > Unfortunately, this is all the patch I have at this time.
> >>> > > > >
> >>> > > > >
> >>> > > > > >
> >>> > > > > >
> >>> > > > > > > On Thu, Aug 16, 2018 at 9:32 AM Luca Boccassi
> >>> > > > > > > <mailto:mailto:mailto:bluca@deb
> https://emea01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fian.org&data=02%7C01%7Cmatan%40mellanox.com%7C7ee011bf19224cb17d9a08d60c202379%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636709729507525613&sdata=0ALmgN%2B6Xnrl3kKeeoJRGTlJmeLZlmcJwsOTcncMUWE%3D&reserved=0>
> wrote:
> >>> > > > > > >
> >>> > > > > > > > During bond 802.3ad receive, a burst of packets is
> fetched
> >>> > > > > > > > from each slave into a local array and appended to
> >>> > > > > > > > per-slave ring buffer.
> >>> > > > > > > > Packets are taken from the head of the ring buffer and
> >>> > > > > > > > returned to the caller.  The number of mbufs provided to
> >>> > > > > > > > each slave is sufficient to meet the requirements of the
> >>> > > > > > > > ixgbe vector receive.
> >>> > > > > >
> >>> > > > > > Luca,
> >>> > > > > >
> >>> > > > > > Can you explain these requirements of ixgbe?
> >>> > > > > >
> >>> > > > > > The ixgbe (and some other Intel PMDs) have vectorized RX
> >>> > > > > > routines that are more efficient (if not faster) taking
> >>> > > > > > advantage of some advanced CPU instructions.  I think you
> need
> >>> > > > > > to be receiving at least 32 packets or more.
> >>> > > > >
> >>> > > > > So, why to do it in bond which is a generic driver for all the
> >>> > > > > vendors PMDs, If for ixgbe and other Intel nics it is better
> you
> >>> > > > > can force those PMDs to receive always 32 packets and to manage
> >>> > > > > a ring by themselves.
> >>> > > > >
> >>> > > > > The drawback of the ring is some additional latency on the
> >>> > > > > receive path.
> >>> > > > > In testing, the additional latency hasn't been an issue for
> bonding.
> >>> > > >
> >>> > > > When bonding does processing slower it may be a bottleneck for
> the
> >>> > > > packet processing for some application.
> >>> > > >
> >>> > > > > The bonding PMD has a fair bit of overhead associated with the
> >>> > > > > RX and TX path calculations.  Most applications can just
> arrange
> >>> > > > > to call the RX path with a sufficiently large receive.  Bonding
> >>> > > > > can't do this.
> >>> > > >
> >>> > > > I didn't talk on application I talked on the slave PMDs, The
> slave
> >>> > > > PMD can manage a ring by itself if it helps for its own
> performance.
> >>> > > > The bonding should not be oriented to specific PMDs.
> >>> > >
> >>> > > The issue though is that the performance problem is not with the
> >>> > > individual PMDs - it's with bonding. There were no reports
> regarding
> >>> > > the individual PMDs.
> >>> > > This comes from reports from customers from real world production
> >>> > > deployments - the issue of bonding being too slow was raised
> multiple
> >>> times.
> >>> > > This patch addresses those issues, again in production deployments,
> >>> > > where it's been used for years, to users and customers
> satisfaction.
> >>> >
> >>> > From Chas I understood that using burst of 32 helps for some slave
> PMDs
> >>> performance which makes sense.
> >>> > I can't understand how the extra copy phases improves the bonding
> itself
> >>> performance:
> >>> >
> >>> > You added 2 copy phases in the bonding RX function:
> >>> > 1. Get packets from the slave to a local array.
> >>> > 2. Copy packet pointers from a local array to the ring array.
> >>> > 3. Copy packet pointers from the ring array to the application array.
> >>> >
> >>> > Each packet arriving to the application must pass the above 3
> phases(in a
> >>> specific call or in previous calls).
> >>> >
> >>> > Without this patch we have only -
> >>> > Get packets from the slave to the application array.
> >>> >
> >>> > Can you explain how the extra copies improves the bonding
> performance?
> >>> >
> >>> > Looks like it improves the slaves PMDs and because of that the
> bonding
> >>> PMD performance becomes better.
> >>>
> >>> I'm not sure that adding more buffer management to the vector PMDs will
> >>> improve the drivers' performance; it's just that calling the rx
> function in such
> >>> a way that it returns no data wastes time.
> >>
> >>Sorry, I don't fully understand what you said here, please rephrase.
> >>
> >>>  The bonding driver is already an exercise in buffer management so
> adding this layer of indirection here makes
> >>> sense in my opinion, as does hiding the details of the consituent
> interfaces where possible.
> >>
> >>Can you explain how this new buffer management with the extra pointer
> copies improves the bonding itself performance?
> >>Looks really strange to me.
> >>
> >>Because rings are generally quite efficient.
> >
> >But you are using a ring in addition to regular array management, it must
> hurt performance of the bonding PMD
> >(means the bonding itself - not the slaves PMDs which are called from the
> bonding)
>

It adds latency.  It increases performance because we spend less CPU
time reading from the PMDs.  This means we have more CPU to use for
post processing (i.e. routing).



> >
> >>
> >>Bonding is in a middle ground between application and PMD.
> >Yes.
> >>What bonding is doing, may not improve all applications.
> >Yes, but it can be solved using some bonding modes.
> >> If using a ring to buffer the vectorized receive routines, improves
> your particular application,
> >>that's great.
> >It may be not great and even bad for some other PMDs which are not
> vectororized.
> >
> >> However, I don't think I can say that it would help all
> >>applications.  As you point out, there is overhead associated with
> >>a ring.
> >Yes.
> >>Bonding's receive burst isn't especially efficient (in mode 4).
> >
> >Why?
> >
> >It makes a copy of the slaves, has a fair bit of stack usage,
> >needs to check the slave status, and needs to examine each
> >packet to see if it is a slow protocol packet.  So each
> >packet is essentially read twice.  The fast queue code for mode 4
> >avoids some of this (and probably ignores checking collecting
> >incorrectly).  If you find a slow protocol packet, you need to
> >chop it out of the array with memmove due to overlap.
>
> Agree.
> So to improve the bonding performance you need to optimize the aboves
> problems.
> There is no connection to the ring.
>

And as I have described numerous times, these problems
can't be easily fixed and preserve the existing API.



>
> >> Bonding benefits from being able to read as much as possible (within
> limits of
> >>course, large reads would blow out caches) from each slave.
> >
> >The slaves PMDs can benefits in the same way.
> >
> >>It can't return all that data though because applications tend to use
> the
> >>burst size that would be efficient for a typical PMD.
> >
> >What is the preferred burst size of the bonding? Maybe the application
> should use it when they are using bonding.
> >
> >The preferred burst size for bonding would be the sum of all the
> >slaves ideal read size.  However, that's not likely to be simple
> >since most applications decide early the size for the read/write
> >burst operations.
> >
> >>An alternative might be to ask bonding applications to simply issue
> larger reads for
> >>certain modes.  That's probably not as easy as it sounds given the
> >>way that the burst length effects multiplexing.
> >
> >Can you explain it more?
> >
> >A longer burst size on one PMD will tend to favor that PMD
> >over others.  It will fill your internal queues with more
> >of its packets.
>
> Agree, it's about fairness.
>
> >
> >>Another solution might be just alternatively poll the individual
> >>slaves on each rx burst.  But that means you need to poll at a
> >>faster rate.  Depending on your application, you might not be
> >>able to do that.
>
> >Again, can you be more precise in the above explanation?
> >
> >If the application knows that there are two slaves backing
> >a bonding interface, the application could just read twice
> >from the bonding interface, knowing that the bonding
> >interface is going to alternate between the slaves.  But
> >this requires the application to know things about the bonding
> >PMD, like the number of slaves.
>
> Why should the application poll twice?
> Poll slave 0, than process it's packets, poll slave 1 than process its
> packets...
> What is the problem?
>

B
ecause let's say that each slave is 10G and you are using

link aggregation with two slaves.  If you have tuned your

application on the assumption that a PMD is approximately
10G, then you are going to be under polling the bonding PMD.


For the above to work, you need to ensure that the application

is polling sufficiently to keep up.


>
> >>  We can avoid this scheduling overhead by just
> >>doing the extra reads in bonding and buffering in a ring.
> >>
> >>Since bonding is going to be buffering everything in a ring,
> >
> >? I don't familiar with it. For now I don't think we need a ring.
> >
> >We have some empirical testing that shows performance improvements.
> >How do you explain this?
>
> You are using a hack in bonding which hurt the bonding but improves the
> vectorized PMD you use.
>
> >Rings aren't inefficient.
>
> Only when there is a need to use it.
>

I believe we have identified a need.


>
> >There's significant overhead of calling the rx burst routine on any PMD.
> >You need to get the PMD data structures into local memory.
>
> Yes.
>
> >Reading as much as possible makes sense.
>
> Agree.
>
> > Could you generally do this for all PMDs?  Yes, but the ring adds
> latency.  Latency
> >that isn't an issue for the bonding PMD because of all the
> >other inefficiencies (for mode 4).
>
> Enlarging the bonding latency by that way makes some vectorized slave PMDs
> happy and makes the bonding worst
> for other slaves PMDs - this is not a good idea to put it upstream.
>
> Improving the other problems in the bonding(reduce the bonding latency)
> will do the job for you and for others.
>

Latency is not the issue with bonding.  The issue with bonding
is the overhead associated with making an rx burst call.  We
add latency (via rings) to make part of the bonding driver more
efficient.

Again, I suspect it even helps the non-vectorized PMDs.  Calling
a PMD's rx burst routine is costly and we are switching between
PMD's inside the bonding driver.  Bonding is halfway between an
application and a PMD.  What we are doing in the bonding PMD is
what an application would typically do.  But instead of forcing
all the bonding users to do this, we are doing it for them.



>
> > >it makes sense to just read as much as is as efficiently possible.  For
> the
> >>Intel adapters, this means using a read big enough trigger the vectorized
> >>versions.
> >>> > > So I'd like to share this improvement rather than keeping it
> private
> >>> > > - because I'm nice that way :-P
> >>> > >
> >>> > > > > > Did you check for other vendor PMDs? It may hurt performance
> >>> > > > > > there..
> >>> > > > > >
> >>> > > > > > I don't know, but I suspect probably not.  For the most part
> >>> > > > > > you are typically reading almost up to the vector
> requirement.
> >>> > > > > > But if one slave has just a single packet, then you can't
> >>> > > > > > vectorize on the next slave.
> >>> > > > > >
> >>> > > > >
> >>> > > > > I don't think that the ring overhead is better for PMDs which
> >>> > > > > are not using the vectorized instructions.
> >>> > > > >
> >>> > > > > The non-vectorized PMDs are usually quite slow.  The additional
> >>> > > > > overhead doesn't make a difference in their performance.
> >>> > > >
> >>> > > > We should not do things worse than they are.
> >>> > >
> >>> > > There were no reports that this made things worse. The feedback
> from
> >>> > > production was that it made things better.
> >>> >
> >>> > Yes, It may be good for specific slaves drivers but hurt another
> >>> > slaves drivers, So maybe it should stay private to specific
> costumers using
> >>> specific nics.
> >>> >
> >>> > Again, I can understand how this patch improves performance of some
> >>> > PMDs therefore I think the bonding is not the place to add it but
> maybe
> >>> some PMDs.
> >>
> >
>
  
Matan Azrad Aug. 28, 2018, 9:51 a.m. UTC | #17
From: Chas Williams
>On Mon, Aug 27, 2018 at 11:30 AM Matan Azrad <mailto:matan@mellanox.com> wrote:
<snip>
>>>Because rings are generally quite efficient.
>>
>>But you are using a ring in addition to regular array management, it must hurt performance of the bonding PMD
>>(means the bonding itself - not the slaves PMDs which are called from the bonding)
>
>It adds latency.

And by that hurts the application performance because it takes more CPU time in the bonding PMD.

>It increases performance because we spend less CPU time reading from the PMDs.

So, it's hack in the bonding PMD to improve some slaves code performance but hurt the bonding code performance,
Over all the performance we gain for those slaves improves the application performance only when working with those slaves. 
But may hurt the application performance when working with other slaves.

>  This means we have more CPU to use for
>post processing (i.e. routing).



>>>Bonding is in a middle ground between application and PMD.
>>Yes.
>>>What bonding is doing, may not improve all applications.
>>Yes, but it can be solved using some bonding modes.
>>> If using a ring to buffer the vectorized receive routines, improves your particular application,
>>>that's great. 
>>It may be not great and even bad for some other PMDs which are not vectororized.
>>
>>> However, I don't think I can say that it would help all
>>>applications.  As you point out, there is overhead associated with
>>>a ring.
>>Yes.
>>>Bonding's receive burst isn't especially efficient (in mode 4).
>>
>>Why?
>>
>>It makes a copy of the slaves, has a fair bit of stack usage, 
>>needs to check the slave status, and needs to examine each
>>packet to see if it is a slow protocol packet.  So each
>>packet is essentially read twice.  The fast queue code for mode 4
>>avoids some of this (and probably ignores checking collecting
>>incorrectly).  If you find a slow protocol packet, you need to
>>chop it out of the array with memmove due to overlap.
>
>Agree.
>So to improve the bonding performance you need to optimize the aboves problems.
>There is no connection to the ring.
>
>And as I have described numerous times, these problems
>can't be easily fixed and preserve the existing API.

Sometimes we need to work harder to see a gain for all.
We should not apply a patch because it is easy and show a gain for specific scenarios.

>>> Bonding benefits from being able to read as much as possible (within limits of
>>>course, large reads would blow out caches) from each slave.
>>
>>The slaves PMDs can benefits in the same way.
>>
>>>It can't return all that data though because applications tend to use the 
>>>burst size that would be efficient for a typical PMD.
>>
>>What is the preferred burst size of the bonding? Maybe the application should use it when they are using bonding.
>>
>>The preferred burst size for bonding would be the sum of all the
>>slaves ideal read size.  However, that's not likely to be simple
>>since most applications decide early the size for the read/write
>>burst operations.
>> 
>>>An alternative might be to ask bonding applications to simply issue larger reads for
>>>certain modes.  That's probably not as easy as it sounds given the
>>>way that the burst length effects multiplexing.
>>
>>Can you explain it more?
>>
>>A longer burst size on one PMD will tend to favor that PMD
>>over others.  It will fill your internal queues with more 
>>of its packets.
>
>Agree, it's about fairness.

>>
>>>Another solution might be just alternatively poll the individual
>>>slaves on each rx burst.  But that means you need to poll at a
>>>faster rate.  Depending on your application, you might not be
>>>able to do that.
>
>>Again, can you be more precise in the above explanation?
>>
>>If the application knows that there are two slaves backing
>>a bonding interface, the application could just read twice
>>from the bonding interface, knowing that the bonding 
>>interface is going to alternate between the slaves.  But
>>this requires the application to know things about the bonding
>>PMD, like the number of slaves.
>
>Why should the application poll twice?
>Poll slave 0, than process it's packets, poll slave 1 than process its packets...
>What is the problem?
>
>Because let's say that each slave is 10G and you are using
>
>link aggregation with two slaves.  If you have tuned your
>
>application on the assumption that a PMD is approximately
>10G, then you are going to be under polling the bonding PMD.
>For the above to work, you need to ensure that the application
>is polling sufficiently to keep up.

But each poll will be shorter, no slaves loops, only one slave call.

>>>  We can avoid this scheduling overhead by just
>>>doing the extra reads in bonding and buffering in a ring.
>>>
>>>Since bonding is going to be buffering everything in a ring,
>>
>>? I don't familiar with it. For now I don't think we need a ring.
>>
>>We have some empirical testing that shows performance improvements.
>>How do you explain this?  
>
>You are using a hack in bonding which hurt the bonding but improves the vectorized PMD you use.
>
>>Rings aren't inefficient.
>
>Only when there is a need to use it.
>
>I believe we have identified a need.

>
>>There's significant overhead of calling the rx burst routine on any PMD.
>>You need to get the PMD data structures into local memory.
>
>Yes.
>
>>Reading as much as possible makes sense.
>
>Agree.
>
>> Could you generally do this for all PMDs?  Yes, but the ring adds latency.  Latency
>>that isn't an issue for the bonding PMD because of all the
>>other inefficiencies (for mode 4).
>
>Enlarging the bonding latency by that way makes some vectorized slave PMDs happy and makes the bonding worst 
>for other slaves PMDs - this is not a good idea to put it upstream.
>
>Improving the other problems in the bonding(reduce the bonding latency) will do the job for you and for others.
>
>Latency is not the issue with bonding.  The issue with bonding
>is the overhead associated with making an rx burst call.  We
>add latency (via rings) to make part of the bonding driver more
>efficient.
>
>Again, I suspect it even helps the non-vectorized PMDs.
 >Calling a PMD's rx burst routine is costly and we are switching between
>PMD's inside the bonding driver.  Bonding is halfway between an
>application and a PMD.  What we are doing in the bonding PMD is
>what an application would typically do.  But instead of forcing
>all the bonding users to do this, we are doing it for them.

Not agree.
The overhead in bonding may be critical for some other PMDs\scenario.

To summarize:
We are not agree.

Bottom line in my side,
It is not a good Idea to add overhead in the bonding PMD which is a generic PMD just to get gain for some specific scenarios in some specific PMDs while for other scenarios\PMDs it is bad.

Matan.
  
Chas Williams Aug. 29, 2018, 2:30 p.m. UTC | #18
On Tue, Aug 28, 2018 at 5:51 AM Matan Azrad <matan@mellanox.com> wrote:

>
>
> From: Chas Williams
> >On Mon, Aug 27, 2018 at 11:30 AM Matan Azrad <mailto:matan@mellanox.com>
> wrote:
> <snip>
> >>>Because rings are generally quite efficient.
> >>
> >>But you are using a ring in addition to regular array management, it
> must hurt performance of the bonding PMD
> >>(means the bonding itself - not the slaves PMDs which are called from
> the bonding)
> >
> >It adds latency.
>
> And by that hurts the application performance because it takes more CPU
> time in the bonding PMD.
>

No, as I said before it takes _less_ CPU time in the bonding PMD
because we use a more optimal read from the slaves.


>
> >It increases performance because we spend less CPU time reading from the
> PMDs.
>
> So, it's hack in the bonding PMD to improve some slaves code performance
> but hurt the bonding code performance,
> Over all the performance we gain for those slaves improves the application
> performance only when working with those slaves.
> But may hurt the application performance when working with other slaves.
>

What is your evidence that is hurts bonding performance?  Your
argument is purely theoretical.  I could easily argue than even
for non-vectorized PMDs there is a performance gain because we
spend less time switching between PMDs.  If you are going to read
from a PMD you should attempt to read as much as possible. It's
expensive to read the cards registers and perform the queue
manipulations.



>
> >  This means we have more CPU to use for
> >post processing (i.e. routing).
>
>
>
> >>>Bonding is in a middle ground between application and PMD.
> >>Yes.
> >>>What bonding is doing, may not improve all applications.
> >>Yes, but it can be solved using some bonding modes.
> >>> If using a ring to buffer the vectorized receive routines, improves
> your particular application,
> >>>that's great.
> >>It may be not great and even bad for some other PMDs which are not
> vectororized.
> >>
> >>> However, I don't think I can say that it would help all
> >>>applications.  As you point out, there is overhead associated with
> >>>a ring.
> >>Yes.
> >>>Bonding's receive burst isn't especially efficient (in mode 4).
> >>
> >>Why?
> >>
> >>It makes a copy of the slaves, has a fair bit of stack usage,
> >>needs to check the slave status, and needs to examine each
> >>packet to see if it is a slow protocol packet.  So each
> >>packet is essentially read twice.  The fast queue code for mode 4
> >>avoids some of this (and probably ignores checking collecting
> >>incorrectly).  If you find a slow protocol packet, you need to
> >>chop it out of the array with memmove due to overlap.
> >
> >Agree.
> >So to improve the bonding performance you need to optimize the aboves
> problems.
> >There is no connection to the ring.
> >
> >And as I have described numerous times, these problems
> >can't be easily fixed and preserve the existing API.
>
> Sometimes we need to work harder to see a gain for all.
> We should not apply a patch because it is easy and show a gain for
> specific scenarios.
>
> >>> Bonding benefits from being able to read as much as possible (within
> limits of
> >>>course, large reads would blow out caches) from each slave.
> >>
> >>The slaves PMDs can benefits in the same way.
> >>
> >>>It can't return all that data though because applications tend to use
> the
> >>>burst size that would be efficient for a typical PMD.
> >>
> >>What is the preferred burst size of the bonding? Maybe the application
> should use it when they are using bonding.
> >>
> >>The preferred burst size for bonding would be the sum of all the
> >>slaves ideal read size.  However, that's not likely to be simple
> >>since most applications decide early the size for the read/write
> >>burst operations.
> >>
> >>>An alternative might be to ask bonding applications to simply issue
> larger reads for
> >>>certain modes.  That's probably not as easy as it sounds given the
> >>>way that the burst length effects multiplexing.
> >>
> >>Can you explain it more?
> >>
> >>A longer burst size on one PMD will tend to favor that PMD
> >>over others.  It will fill your internal queues with more
> >>of its packets.
> >
> >Agree, it's about fairness.
> >
> >>
> >>>Another solution might be just alternatively poll the individual
> >>>slaves on each rx burst.  But that means you need to poll at a
> >>>faster rate.  Depending on your application, you might not be
> >>>able to do that.
> >
> >>Again, can you be more precise in the above explanation?
> >>
> >>If the application knows that there are two slaves backing
> >>a bonding interface, the application could just read twice
> >>from the bonding interface, knowing that the bonding
> >>interface is going to alternate between the slaves.  But
> >>this requires the application to know things about the bonding
> >>PMD, like the number of slaves.
> >
> >Why should the application poll twice?
> >Poll slave 0, than process it's packets, poll slave 1 than process its
> packets...
> >What is the problem?
> >
> >Because let's say that each slave is 10G and you are using
> >
> >link aggregation with two slaves.  If you have tuned your
> >
> >application on the assumption that a PMD is approximately
> >10G, then you are going to be under polling the bonding PMD.
> >For the above to work, you need to ensure that the application
> >is polling sufficiently to keep up.
>
> But each poll will be shorter, no slaves loops, only one slave call.
>

But you still need to poll the bonding PMD N times as
fast where N is the number of slaves.  The "scheduler"
in the application may not be aware of that.  That was
(apparently) the point of the way the current bonding
PMD.  It hides everything from the application.



>
> >>>  We can avoid this scheduling overhead by just
> >>>doing the extra reads in bonding and buffering in a ring.
> >>>
> >>>Since bonding is going to be buffering everything in a ring,
> >>
> >>? I don't familiar with it. For now I don't think we need a ring.
> >>
> >>We have some empirical testing that shows performance improvements.
> >>How do you explain this?
> >
> >You are using a hack in bonding which hurt the bonding but improves the
> vectorized PMD you use.
> >
> >>Rings aren't inefficient.
> >
> >Only when there is a need to use it.
> >
> >I believe we have identified a need.
> >
> >
> >>There's significant overhead of calling the rx burst routine on any PMD.
> >>You need to get the PMD data structures into local memory.
> >
> >Yes.
> >
> >>Reading as much as possible makes sense.
> >
> >Agree.
> >
> >> Could you generally do this for all PMDs?  Yes, but the ring adds
> latency.  Latency
> >>that isn't an issue for the bonding PMD because of all the
> >>other inefficiencies (for mode 4).
> >
> >Enlarging the bonding latency by that way makes some vectorized slave
> PMDs happy and makes the bonding worst
> >for other slaves PMDs - this is not a good idea to put it upstream.
> >
> >Improving the other problems in the bonding(reduce the bonding latency)
> will do the job for you and for others.
> >
> >Latency is not the issue with bonding.  The issue with bonding
> >is the overhead associated with making an rx burst call.  We
> >add latency (via rings) to make part of the bonding driver more
> >efficient.
> >
> >Again, I suspect it even helps the non-vectorized PMDs.
>  >Calling a PMD's rx burst routine is costly and we are switching between
> >PMD's inside the bonding driver.  Bonding is halfway between an
> >application and a PMD.  What we are doing in the bonding PMD is
> >what an application would typically do.  But instead of forcing
> >all the bonding users to do this, we are doing it for them.
>
> Not agree.
> The overhead in bonding may be critical for some other PMDs\scenario.
>

The overhead is bonding is undesirable.  It's not easy to address
because of the initial design goals.


>
> To summarize:
> We are not agree.
>
> Bottom line in my side,
> It is not a good Idea to add overhead in the bonding PMD which is a
> generic PMD just to get gain for some specific scenarios in some specific
> PMDs while for other scenarios\PMDs it is bad.
>

You are making some assumptions which are simply not valid.
BOnding is _not_ a generic PMD.  As discussed above, bonding
is somewhere between application and PMD.  The choices made
for bonding where to make it easy to integrate into an existing
application with the application having to know anything about
bonding.


> Matan.
>
>
>
>
>
  
Matan Azrad Aug. 29, 2018, 3:20 p.m. UTC | #19
From: Chas Williams
>On Tue, Aug 28, 2018 at 5:51 AM Matan Azrad <mailto:matan@mellanox.com> wrote:
>
>
>From: Chas Williams
>>On Mon, Aug 27, 2018 at 11:30 AM Matan Azrad <mailto:mailto:matan@mellanox.com> wrote:
><snip>
>>>>Because rings are generally quite efficient.
>>>
>>>But you are using a ring in addition to regular array management, it must hurt performance of the bonding PMD
>>>(means the bonding itself - not the slaves PMDs which are called from the bonding)
>>
>>It adds latency.
>
>And by that hurts the application performance because it takes more CPU time in the bonding PMD.
>
>No, as I said before it takes _less_ CPU time in the bonding PMD
>because we use a more optimal read from the slaves.

Each packet pointer should be copied more 2 times because of this patch + some management(the ring overhead)
So in the bonding code you lose performance.

>
>>It increases performance because we spend less CPU time reading from the PMDs.
>
>So, it's hack in the bonding PMD to improve some slaves code performance but hurt the bonding code performance,
>Over all the performance we gain for those slaves improves the application performance only when working with those slaves. 
>But may hurt the application performance when working with other slaves.
>
>What is your evidence that is hurts bonding performance?  Your
>argument is purely theoretical.
Yes, we cannot test all the scenarios cross the PMDs.

>  I could easily argue than even for non-vectorized PMDs there is a performance gain because we
>spend less time switching between PMDs.

But spend more time in the bonding part.

 > If you are going to read from a PMD you should attempt to read as much as possible. It's
>expensive to read the cards registers and perform the queue
>manipulations.

You do it anyway.

The context changing is expensive but also the extra copies per packet and the ring management.

We have here tradeoff that may be affect differently for other scenarios.

>
>>  This means we have more CPU to use for
>>post processing (i.e. routing).
>
>
>
>>>>Bonding is in a middle ground between application and PMD.
>>>Yes.
>>>>What bonding is doing, may not improve all applications.
>>>Yes, but it can be solved using some bonding modes.
>>>> If using a ring to buffer the vectorized receive routines, improves your particular application,
>>>>that's great. 
>>>It may be not great and even bad for some other PMDs which are not vectororized.
>>>
>>>> However, I don't think I can say that it would help all
>>>>applications.  As you point out, there is overhead associated with
>>>>a ring.
>>>Yes.
>>>>Bonding's receive burst isn't especially efficient (in mode 4).
>>>
>>>Why?
>>>
>>>It makes a copy of the slaves, has a fair bit of stack usage, 
>>>needs to check the slave status, and needs to examine each
>>>packet to see if it is a slow protocol packet.  So each
>>>packet is essentially read twice.  The fast queue code for mode 4
>>>avoids some of this (and probably ignores checking collecting
>>>incorrectly).  If you find a slow protocol packet, you need to
>>>chop it out of the array with memmove due to overlap.
>>
>>Agree.
>>So to improve the bonding performance you need to optimize the aboves problems.
>>There is no connection to the ring.
>>
>>And as I have described numerous times, these problems
>>can't be easily fixed and preserve the existing API.
>
>Sometimes we need to work harder to see a gain for all.
>We should not apply a patch because it is easy and show a gain for specific scenarios.
>
>>>> Bonding benefits from being able to read as much as possible (within limits of
>>>>course, large reads would blow out caches) from each slave.
>>>
>>>The slaves PMDs can benefits in the same way.
>>>
>>>>It can't return all that data though because applications tend to use the 
>>>>burst size that would be efficient for a typical PMD.
>>>
>>>What is the preferred burst size of the bonding? Maybe the application should use it when they are using bonding.
>>>
>>>The preferred burst size for bonding would be the sum of all the
>>>slaves ideal read size.  However, that's not likely to be simple
>>>since most applications decide early the size for the read/write
>>>burst operations.
>>> 
>>>>An alternative might be to ask bonding applications to simply issue larger reads for
>>>>certain modes.  That's probably not as easy as it sounds given the
>>>>way that the burst length effects multiplexing.
>>>
>>>Can you explain it more?
>>>
>>>A longer burst size on one PMD will tend to favor that PMD
>>>over others.  It will fill your internal queues with more 
>>>of its packets.
>>
>>Agree, it's about fairness.
>> 
>>>
>>>>Another solution might be just alternatively poll the individual
>>>>slaves on each rx burst.  But that means you need to poll at a
>>>>faster rate.  Depending on your application, you might not be
>>>>able to do that.
>>
>>>Again, can you be more precise in the above explanation?
>>>
>>>If the application knows that there are two slaves backing
>>>a bonding interface, the application could just read twice
>>>from the bonding interface, knowing that the bonding 
>>>interface is going to alternate between the slaves.  But
>>>this requires the application to know things about the bonding
>>>PMD, like the number of slaves.
>>
>>Why should the application poll twice?
>>Poll slave 0, than process it's packets, poll slave 1 than process its packets...
>>What is the problem?
>>
>>Because let's say that each slave is 10G and you are using
>>
>>link aggregation with two slaves.  If you have tuned your
>>
>>application on the assumption that a PMD is approximately
>>10G, then you are going to be under polling the bonding PMD.
>>For the above to work, you need to ensure that the application
>>is polling sufficiently to keep up.
>
>But each poll will be shorter, no slaves loops, only one slave call.
>
>But you still need to poll the bonding PMD N times as
>fast where N is the number of slaves.  The "scheduler"
>in the application may not be aware of that.  That was
>(apparently) the point of the way the current bonding
>PMD.  It hides everything from the application.

Sorry, I don't sure I understand you here.
Also here we have tradeoff:
Read full burst from each slave + less bonding code per burst
against
More bonding calls(it is not in N relation it depends in heavy and the burst size).
 

>>>>  We can avoid this scheduling overhead by just
>>>>doing the extra reads in bonding and buffering in a ring.
>>>>
>>>>Since bonding is going to be buffering everything in a ring,
>>>
>>>? I don't familiar with it. For now I don't think we need a ring.
>>>
>>>We have some empirical testing that shows performance improvements.
>>>How do you explain this?  
>>
>>You are using a hack in bonding which hurt the bonding but improves the vectorized PMD you use.
>>
>>>Rings aren't inefficient.
>>
>>Only when there is a need to use it.
>>
>>I believe we have identified a need.
>> 
>>
>>>There's significant overhead of calling the rx burst routine on any PMD.
>>>You need to get the PMD data structures into local memory.
>>
>>Yes.
>>
>>>Reading as much as possible makes sense.
>>
>>Agree.
>>
>>> Could you generally do this for all PMDs?  Yes, but the ring adds latency.  Latency
>>>that isn't an issue for the bonding PMD because of all the
>>>other inefficiencies (for mode 4).
>>
>>Enlarging the bonding latency by that way makes some vectorized slave PMDs happy and makes the bonding worst 
>>for other slaves PMDs - this is not a good idea to put it upstream.
>>
>>Improving the other problems in the bonding(reduce the bonding latency) will do the job for you and for others.
>>
>>Latency is not the issue with bonding.  The issue with bonding
>>is the overhead associated with making an rx burst call.  We
>>add latency (via rings) to make part of the bonding driver more
>>efficient.
>>
>>Again, I suspect it even helps the non-vectorized PMDs.
> >Calling a PMD's rx burst routine is costly and we are switching between
>>PMD's inside the bonding driver.  Bonding is halfway between an
>>application and a PMD.  What we are doing in the bonding PMD is
>>what an application would typically do.  But instead of forcing
>>all the bonding users to do this, we are doing it for them.
>
>Not agree.
>The overhead in bonding may be critical for some other PMDs\scenario.
>
>The overhead is bonding is undesirable.  It's not easy to address
>because of the initial design goals.

>
>To summarize:
>We are not agree.
>
>Bottom line in my side,
>It is not a good Idea to add overhead in the bonding PMD which is a generic PMD just to get gain for some specific scenarios in some specific PMDs while for other scenarios\PMDs it is bad.
>
>You are making some assumptions which are simply not valid.
>BOnding is _not_ a generic PMD.
What?
Should the bonding be good only for specific scenarios with specific PMDs?
If no, it is generic.

>  As discussed above, bonding
>is somewhere between application and PMD.
Yes.

>  The choices made
>for bonding where to make it easy to integrate into an existing
>application with the application having to know anything about
>bonding.
The bonding should be generic at least from the next perspectives:
Different PMDs.
Different application scenarios.





>
>Matan. 
>
>
>
>
  
Luca Boccassi Aug. 31, 2018, 4:01 p.m. UTC | #20
On Wed, 2018-08-29 at 15:20 +0000, Matan Azrad wrote:
> 
> From: Chas Williams
> > On Tue, Aug 28, 2018 at 5:51 AM Matan Azrad <mailto:matan@mellanox.
> > com> wrote:
> > 
> > 
> > From: Chas Williams
> > > On Mon, Aug 27, 2018 at 11:30 AM Matan Azrad <mailto:mailto:matan
> > > @mellanox.com> wrote:
> > 
> > <snip>
> > > > > Because rings are generally quite efficient.
> > > > 
> > > > But you are using a ring in addition to regular array
> > > > management, it must hurt performance of the bonding PMD
> > > > (means the bonding itself - not the slaves PMDs which are
> > > > called from the bonding)
> > > 
> > > It adds latency.
> > 
> > And by that hurts the application performance because it takes more
> > CPU time in the bonding PMD.
> > 
> > No, as I said before it takes _less_ CPU time in the bonding PMD
> > because we use a more optimal read from the slaves.
> 
> Each packet pointer should be copied more 2 times because of this
> patch + some management(the ring overhead)
> So in the bonding code you lose performance.
> 
> > 
> > > It increases performance because we spend less CPU time reading
> > > from the PMDs.
> > 
> > So, it's hack in the bonding PMD to improve some slaves code
> > performance but hurt the bonding code performance,
> > Over all the performance we gain for those slaves improves the
> > application performance only when working with those slaves. 
> > But may hurt the application performance when working with other
> > slaves.
> > 
> > What is your evidence that is hurts bonding performance?  Your
> > argument is purely theoretical.
> 
> Yes, we cannot test all the scenarios cross the PMDs.

Chas has evidence that this helps, a _lot_, in some very common cases.
We haven't seen evidence of negative impact anywhere in 2 years. Given
this, surely it's not unreasonable to ask to substantiate theoretical
arguments with some testing?

> >   I could easily argue than even for non-vectorized PMDs there is a
> > performance gain because we
> > spend less time switching between PMDs.
> 
> But spend more time in the bonding part.
> 
>  > If you are going to read from a PMD you should attempt to read as
> much as possible. It's
> > expensive to read the cards registers and perform the queue
> > manipulations.
> 
> You do it anyway.
> 
> The context changing is expensive but also the extra copies per
> packet and the ring management.
> 
> We have here tradeoff that may be affect differently for other
> scenarios.
> 
> > 
> > >   This means we have more CPU to use for
> > > post processing (i.e. routing).
> > 
> > 
> > 
> > > > > Bonding is in a middle ground between application and PMD.
> > > > 
> > > > Yes.
> > > > > What bonding is doing, may not improve all applications.
> > > > 
> > > > Yes, but it can be solved using some bonding modes.
> > > > > If using a ring to buffer the vectorized receive routines,
> > > > > improves your particular application,
> > > > > that's great. 
> > > > 
> > > > It may be not great and even bad for some other PMDs which are
> > > > not vectororized.
> > > > 
> > > > > However, I don't think I can say that it would help all
> > > > > applications.  As you point out, there is overhead associated
> > > > > with
> > > > > a ring.
> > > > 
> > > > Yes.
> > > > > Bonding's receive burst isn't especially efficient (in mode
> > > > > 4).
> > > > 
> > > > Why?
> > > > 
> > > > It makes a copy of the slaves, has a fair bit of stack usage, 
> > > > needs to check the slave status, and needs to examine each
> > > > packet to see if it is a slow protocol packet.  So each
> > > > packet is essentially read twice.  The fast queue code for mode
> > > > 4
> > > > avoids some of this (and probably ignores checking collecting
> > > > incorrectly).  If you find a slow protocol packet, you need to
> > > > chop it out of the array with memmove due to overlap.
> > > 
> > > Agree.
> > > So to improve the bonding performance you need to optimize the
> > > aboves problems.
> > > There is no connection to the ring.
> > > 
> > > And as I have described numerous times, these problems
> > > can't be easily fixed and preserve the existing API.
> > 
> > Sometimes we need to work harder to see a gain for all.
> > We should not apply a patch because it is easy and show a gain for
> > specific scenarios.
> > 
> > > > > Bonding benefits from being able to read as much as possible
> > > > > (within limits of
> > > > > course, large reads would blow out caches) from each slave.
> > > > 
> > > > The slaves PMDs can benefits in the same way.
> > > > 
> > > > > It can't return all that data though because applications
> > > > > tend to use the 
> > > > > burst size that would be efficient for a typical PMD.
> > > > 
> > > > What is the preferred burst size of the bonding? Maybe the
> > > > application should use it when they are using bonding.
> > > > 
> > > > The preferred burst size for bonding would be the sum of all
> > > > the
> > > > slaves ideal read size.  However, that's not likely to be
> > > > simple
> > > > since most applications decide early the size for the
> > > > read/write
> > > > burst operations.
> > > >  
> > > > > An alternative might be to ask bonding applications to simply
> > > > > issue larger reads for
> > > > > certain modes.  That's probably not as easy as it sounds
> > > > > given the
> > > > > way that the burst length effects multiplexing.
> > > > 
> > > > Can you explain it more?
> > > > 
> > > > A longer burst size on one PMD will tend to favor that PMD
> > > > over others.  It will fill your internal queues with more 
> > > > of its packets.
> > > 
> > > Agree, it's about fairness.
> > >  
> > > > 
> > > > > Another solution might be just alternatively poll the
> > > > > individual
> > > > > slaves on each rx burst.  But that means you need to poll at
> > > > > a
> > > > > faster rate.  Depending on your application, you might not be
> > > > > able to do that.
> > > > Again, can you be more precise in the above explanation?
> > > > 
> > > > If the application knows that there are two slaves backing
> > > > a bonding interface, the application could just read twice
> > > > from the bonding interface, knowing that the bonding 
> > > > interface is going to alternate between the slaves.  But
> > > > this requires the application to know things about the bonding
> > > > PMD, like the number of slaves.
> > > 
> > > Why should the application poll twice?
> > > Poll slave 0, than process it's packets, poll slave 1 than
> > > process its packets...
> > > What is the problem?
> > > 
> > > Because let's say that each slave is 10G and you are using
> > > 
> > > link aggregation with two slaves.  If you have tuned your
> > > 
> > > application on the assumption that a PMD is approximately
> > > 10G, then you are going to be under polling the bonding PMD.
> > > For the above to work, you need to ensure that the application
> > > is polling sufficiently to keep up.
> > 
> > But each poll will be shorter, no slaves loops, only one slave
> > call.
> > 
> > But you still need to poll the bonding PMD N times as
> > fast where N is the number of slaves.  The "scheduler"
> > in the application may not be aware of that.  That was
> > (apparently) the point of the way the current bonding
> > PMD.  It hides everything from the application.
> 
> Sorry, I don't sure I understand you here.
> Also here we have tradeoff:
> Read full burst from each slave + less bonding code per burst
> against
> More bonding calls(it is not in N relation it depends in heavy and
> the burst size).
>  
> 
> > > > >   We can avoid this scheduling overhead by just
> > > > > doing the extra reads in bonding and buffering in a ring.
> > > > > 
> > > > > Since bonding is going to be buffering everything in a ring,
> > > > 
> > > > ? I don't familiar with it. For now I don't think we need a
> > > > ring.
> > > > 
> > > > We have some empirical testing that shows performance
> > > > improvements.
> > > > How do you explain this?  
> > > 
> > > You are using a hack in bonding which hurt the bonding but
> > > improves the vectorized PMD you use.
> > > 
> > > > Rings aren't inefficient.
> > > 
> > > Only when there is a need to use it.
> > > 
> > > I believe we have identified a need.
> > >  
> > > 
> > > > There's significant overhead of calling the rx burst routine on
> > > > any PMD.
> > > > You need to get the PMD data structures into local memory.
> > > 
> > > Yes.
> > > 
> > > > Reading as much as possible makes sense.
> > > 
> > > Agree.
> > > 
> > > > Could you generally do this for all PMDs?  Yes, but the ring
> > > > adds latency.  Latency
> > > > that isn't an issue for the bonding PMD because of all the
> > > > other inefficiencies (for mode 4).
> > > 
> > > Enlarging the bonding latency by that way makes some vectorized
> > > slave PMDs happy and makes the bonding worst 
> > > for other slaves PMDs - this is not a good idea to put it
> > > upstream.
> > > 
> > > Improving the other problems in the bonding(reduce the bonding
> > > latency) will do the job for you and for others.
> > > 
> > > Latency is not the issue with bonding.  The issue with bonding
> > > is the overhead associated with making an rx burst call.  We
> > > add latency (via rings) to make part of the bonding driver more
> > > efficient.
> > > 
> > > Again, I suspect it even helps the non-vectorized PMDs.
> > 
> >  >Calling a PMD's rx burst routine is costly and we are switching
> > between
> > > PMD's inside the bonding driver.  Bonding is halfway between an
> > > application and a PMD.  What we are doing in the bonding PMD is
> > > what an application would typically do.  But instead of forcing
> > > all the bonding users to do this, we are doing it for them.
> > 
> > Not agree.
> > The overhead in bonding may be critical for some other
> > PMDs\scenario.
> > 
> > The overhead is bonding is undesirable.  It's not easy to address
> > because of the initial design goals.
> >  
> > 
> > To summarize:
> > We are not agree.
> > 
> > Bottom line in my side,
> > It is not a good Idea to add overhead in the bonding PMD which is a
> > generic PMD just to get gain for some specific scenarios in some
> > specific PMDs while for other scenarios\PMDs it is bad.
> > 
> > You are making some assumptions which are simply not valid.
> > BOnding is _not_ a generic PMD.
> 
> What?
> Should the bonding be good only for specific scenarios with specific
> PMDs?
> If no, it is generic.
> 
> >   As discussed above, bonding
> > is somewhere between application and PMD.
> 
> Yes.
> 
> >   The choices made
> > for bonding where to make it easy to integrate into an existing
> > application with the application having to know anything about
> > bonding.
> 
> The bonding should be generic at least from the next perspectives:
> Different PMDs.
> Different application scenarios.
> 
> 
> 
> 
> 
> > 
> > Matan. 
> > 
> > 
> > 
> >
  
Matan Azrad Sept. 2, 2018, 11:34 a.m. UTC | #21
Hi Luca\Chas

From: Luca Boccassi
> On Wed, 2018-08-29 at 15:20 +0000, Matan Azrad wrote:
> >
> > From: Chas Williams
> > > On Tue, Aug 28, 2018 at 5:51 AM Matan Azrad <mailto:matan@mellanox.
> > > com> wrote:
> > >
> > >
> > > From: Chas Williams
> > > > On Mon, Aug 27, 2018 at 11:30 AM Matan Azrad <mailto:mailto:matan
> > > > @mellanox.com> wrote:
> > >
> > > <snip>
> > > > > > Because rings are generally quite efficient.
> > > > >
> > > > > But you are using a ring in addition to regular array
> > > > > management, it must hurt performance of the bonding PMD (means
> > > > > the bonding itself - not the slaves PMDs which are called from
> > > > > the bonding)
> > > >
> > > > It adds latency.
> > >
> > > And by that hurts the application performance because it takes more
> > > CPU time in the bonding PMD.
> > >
> > > No, as I said before it takes _less_ CPU time in the bonding PMD
> > > because we use a more optimal read from the slaves.
> >
> > Each packet pointer should be copied more 2 times because of this
> > patch + some management(the ring overhead) So in the bonding code you
> > lose performance.
> >
> > >
> > > > It increases performance because we spend less CPU time reading
> > > > from the PMDs.
> > >
> > > So, it's hack in the bonding PMD to improve some slaves code
> > > performance but hurt the bonding code performance, Over all the
> > > performance we gain for those slaves improves the application
> > > performance only when working with those slaves.
> > > But may hurt the application performance when working with other
> > > slaves.
> > >
> > > What is your evidence that is hurts bonding performance?  Your
> > > argument is purely theoretical.
> >
> > Yes, we cannot test all the scenarios cross the PMDs.
> 
> Chas has evidence that this helps, a _lot_, in some very common cases.
> We haven't seen evidence of negative impact anywhere in 2 years. Given
> this, surely it's not unreasonable to ask to substantiate theoretical arguments
> with some testing?

What is the common cases of the bond usage?
Do you really know all the variance of the bond usages spreading all over the world?

I’m saying that using a hack in the bond code which helps for some slaves PMDs\application scenarios (your common cases) but hurting
the bond code performance and latency is not the right thing to do because it may hurt other scenarios\PMDs  using the bond.
  
Chas Williams Sept. 9, 2018, 8:57 p.m. UTC | #22
On Sun, Sep 2, 2018 at 7:34 AM Matan Azrad <matan@mellanox.com> wrote:
>
> Hi Luca\Chas
>
> From: Luca Boccassi
> > On Wed, 2018-08-29 at 15:20 +0000, Matan Azrad wrote:
> > >
> > > From: Chas Williams
> > > > On Tue, Aug 28, 2018 at 5:51 AM Matan Azrad <mailto:matan@mellanox.
> > > > com> wrote:
> > > >
> > > >
> > > > From: Chas Williams
> > > > > On Mon, Aug 27, 2018 at 11:30 AM Matan Azrad <mailto:mailto:matan
> > > > > @mellanox.com> wrote:
> > > >
> > > > <snip>
> > > > > > > Because rings are generally quite efficient.
> > > > > >
> > > > > > But you are using a ring in addition to regular array
> > > > > > management, it must hurt performance of the bonding PMD (means
> > > > > > the bonding itself - not the slaves PMDs which are called from
> > > > > > the bonding)
> > > > >
> > > > > It adds latency.
> > > >
> > > > And by that hurts the application performance because it takes more
> > > > CPU time in the bonding PMD.
> > > >
> > > > No, as I said before it takes _less_ CPU time in the bonding PMD
> > > > because we use a more optimal read from the slaves.
> > >
> > > Each packet pointer should be copied more 2 times because of this
> > > patch + some management(the ring overhead) So in the bonding code you
> > > lose performance.
> > >
> > > >
> > > > > It increases performance because we spend less CPU time reading
> > > > > from the PMDs.
> > > >
> > > > So, it's hack in the bonding PMD to improve some slaves code
> > > > performance but hurt the bonding code performance, Over all the
> > > > performance we gain for those slaves improves the application
> > > > performance only when working with those slaves.
> > > > But may hurt the application performance when working with other
> > > > slaves.
> > > >
> > > > What is your evidence that is hurts bonding performance?  Your
> > > > argument is purely theoretical.
> > >
> > > Yes, we cannot test all the scenarios cross the PMDs.
> >
> > Chas has evidence that this helps, a _lot_, in some very common cases.
> > We haven't seen evidence of negative impact anywhere in 2 years. Given
> > this, surely it's not unreasonable to ask to substantiate theoretical arguments
> > with some testing?
>
> What is the common cases of the bond usage?
> Do you really know all the variance of the bond usages spreading all over the world?

We actually have a fairly large number of deployments using this bonding code
across a couple different adapter types (mostly Intel though and some virtual
usage).  The patch was designed to address starvation of slaves because of
the way that vector receives tend on the Intel PMDs.  If there isn't
enough space
to attempt a vector receive (a minimum of 4 buffers), then the rx
burst will return
a value of 0 -- no buffers read.  The rx burst in bonding moves to the
next adapter.
So this tend to starve any slaves that aren't first in line.  The issue doesn't
really show up in single streams.  You need to run multiple streams
that multiplex
across all the slaves.

>
> I’m saying that using a hack in the bond code which helps for some slaves PMDs\application scenarios (your common cases) but hurting
> the bond code performance and latency is not the right thing to do because it may hurt other scenarios\PMDs  using the bond.

What do you think about the attached patch?  It implements an explicit
round-robin
for the "first" slave in order to enforce some sort of fairness.  Some
limited testing
has shown that this our application scan scale polling to read the
PMDs fast enough.
Note, I have only tested the 802.3ad paths.  The changes are likely
necessary for
the other RX burst routines since they should suffer the same issue.
  
Matan Azrad Sept. 12, 2018, 5:38 a.m. UTC | #23
Hi Chas

From: Chas Williams
> On Sun, Sep 2, 2018 at 7:34 AM Matan Azrad <matan@mellanox.com> wrote:
> >
> > Hi Luca\Chas
> >
> > From: Luca Boccassi
> > > On Wed, 2018-08-29 at 15:20 +0000, Matan Azrad wrote:
> > > >
> > > > From: Chas Williams
> > > > > On Tue, Aug 28, 2018 at 5:51 AM Matan Azrad
> <mailto:matan@mellanox.
> > > > > com> wrote:
> > > > >
> > > > >
> > > > > From: Chas Williams
> > > > > > On Mon, Aug 27, 2018 at 11:30 AM Matan Azrad
> > > > > > <mailto:mailto:matan @mellanox.com> wrote:
> > > > >
> > > > > <snip>
> > > > > > > > Because rings are generally quite efficient.
> > > > > > >
> > > > > > > But you are using a ring in addition to regular array
> > > > > > > management, it must hurt performance of the bonding PMD
> > > > > > > (means the bonding itself - not the slaves PMDs which are
> > > > > > > called from the bonding)
> > > > > >
> > > > > > It adds latency.
> > > > >
> > > > > And by that hurts the application performance because it takes
> > > > > more CPU time in the bonding PMD.
> > > > >
> > > > > No, as I said before it takes _less_ CPU time in the bonding PMD
> > > > > because we use a more optimal read from the slaves.
> > > >
> > > > Each packet pointer should be copied more 2 times because of this
> > > > patch + some management(the ring overhead) So in the bonding code
> > > > you lose performance.
> > > >
> > > > >
> > > > > > It increases performance because we spend less CPU time
> > > > > > reading from the PMDs.
> > > > >
> > > > > So, it's hack in the bonding PMD to improve some slaves code
> > > > > performance but hurt the bonding code performance, Over all the
> > > > > performance we gain for those slaves improves the application
> > > > > performance only when working with those slaves.
> > > > > But may hurt the application performance when working with other
> > > > > slaves.
> > > > >
> > > > > What is your evidence that is hurts bonding performance?  Your
> > > > > argument is purely theoretical.
> > > >
> > > > Yes, we cannot test all the scenarios cross the PMDs.
> > >
> > > Chas has evidence that this helps, a _lot_, in some very common cases.
> > > We haven't seen evidence of negative impact anywhere in 2 years.
> > > Given this, surely it's not unreasonable to ask to substantiate
> > > theoretical arguments with some testing?
> >
> > What is the common cases of the bond usage?
> > Do you really know all the variance of the bond usages spreading all over
> the world?
> 
> We actually have a fairly large number of deployments using this bonding
> code across a couple different adapter types (mostly Intel though and some
> virtual usage).  The patch was designed to address starvation of slaves
> because of the way that vector receives tend on the Intel PMDs.  If there
> isn't enough space to attempt a vector receive (a minimum of 4 buffers),
> then the rx burst will return a value of 0 -- no buffers read.  The rx burst in
> bonding moves to the next adapter.
> So this tend to starve any slaves that aren't first in line.  The issue doesn't
> really show up in single streams.  You need to run multiple streams that
> multiplex across all the slaves.
> 
> >
> > I’m saying that using a hack in the bond code which helps for some
> > slaves PMDs\application scenarios (your common cases) but hurting the
> bond code performance and latency is not the right thing to do because it
> may hurt other scenarios\PMDs  using the bond.
> 
> What do you think about the attached patch?  It implements an explicit
> round-robin for the "first" slave in order to enforce some sort of fairness.
> Some limited testing has shown that this our application scan scale polling to
> read the PMDs fast enough.
> Note, I have only tested the 802.3ad paths.  The changes are likely necessary
> for the other RX burst routines since they should suffer the same issue.

The attached patch idea looks fair to me, no performance\latency impact on the bond code with an improvement for some scenarios.
The fairness between slaves is still there.
But please don't use % in the datapath code.

Matan.
  
Luca Boccassi Sept. 19, 2018, 6:09 p.m. UTC | #24
On Thu, 2018-08-16 at 14:32 +0100, Luca Boccassi wrote:
> During bond 802.3ad receive, a burst of packets is fetched from
> each slave into a local array and appended to per-slave ring buffer.
> Packets are taken from the head of the ring buffer and returned to
> the caller.  The number of mbufs provided to each slave is sufficient
> to meet the requirements of the ixgbe vector receive.
> 
> This small change improves performances of the bonding driver
> considerably. Vyatta has been using it for years in production.
> 
> Cc: stable@dpdk.org
> 
> Signed-off-by: Eric Kinzie <ekinzie@brocade.com>
> Signed-off-by: Luca Boccassi <bluca@debian.org>
> ---
> v2 and v3: fix checkpatch warnings
> v4: add Eric's original signed-off-by from the Vyatta internal repo

Superseded by: https://patches.dpdk.org/patch/44956/
  

Patch

diff --git a/drivers/net/bonding/rte_eth_bond_api.c b/drivers/net/bonding/rte_eth_bond_api.c
index 8bc04cfd11..3d22203e91 100644
--- a/drivers/net/bonding/rte_eth_bond_api.c
+++ b/drivers/net/bonding/rte_eth_bond_api.c
@@ -524,6 +524,10 @@  __eth_bond_slave_remove_lock_free(uint16_t bonded_port_id,
 					sizeof(*(rte_eth_devices[bonded_port_id].data->mac_addrs)));
 	}
 	if (internals->slave_count == 0) {
+		/* Remove any remaining packets in the receive ring */
+		struct rte_mbuf *bufs[PMD_BOND_RECV_PKTS_PER_SLAVE];
+		unsigned int j, count;
+
 		internals->rx_offload_capa = 0;
 		internals->tx_offload_capa = 0;
 		internals->rx_queue_offload_capa = 0;
@@ -532,6 +536,15 @@  __eth_bond_slave_remove_lock_free(uint16_t bonded_port_id,
 		internals->reta_size = 0;
 		internals->candidate_max_rx_pktlen = 0;
 		internals->max_rx_pktlen = 0;
+
+		do {
+			count = rte_ring_dequeue_burst(internals->rx_ring,
+					(void **)bufs,
+					PMD_BOND_RECV_PKTS_PER_SLAVE,
+					NULL);
+			for  (j = 0; j < count; j++)
+				rte_pktmbuf_free(bufs[j]);
+		} while (count > 0);
 	}
 	return 0;
 }
diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c
index 58f7377c60..ae756c4e3a 100644
--- a/drivers/net/bonding/rte_eth_bond_pmd.c
+++ b/drivers/net/bonding/rte_eth_bond_pmd.c
@@ -18,6 +18,8 @@ 
 #include <rte_alarm.h>
 #include <rte_cycles.h>
 #include <rte_string_fns.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
 
 #include "rte_eth_bond.h"
 #include "rte_eth_bond_private.h"
@@ -402,10 +404,15 @@  bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
 	struct bond_dev_private *internals = bd_rx_q->dev_private;
 	struct ether_addr bond_mac;
 
+	unsigned int rx_ring_avail = rte_ring_free_count(internals->rx_ring);
+	struct rte_mbuf *mbuf_bounce[PMD_BOND_RECV_PKTS_PER_SLAVE];
+
 	struct ether_hdr *hdr;
 
 	const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
 	uint16_t num_rx_total = 0;	/* Total number of received packets */
+	uint16_t num_rx_slave;
+	uint16_t num_enq_slave;
 	uint16_t slaves[RTE_MAX_ETHPORTS];
 	uint16_t slave_count, idx;
 
@@ -414,6 +421,9 @@  bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
 	uint8_t i, j, k;
 	uint8_t subtype;
 
+	if (rx_ring_avail < PMD_BOND_RECV_PKTS_PER_SLAVE)
+		goto dequeue;
+
 	rte_eth_macaddr_get(internals->port_id, &bond_mac);
 	/* Copy slave list to protect against slave up/down changes during tx
 	 * bursting */
@@ -426,62 +436,96 @@  bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
 		internals->active_slave = 0;
 		idx = 0;
 	}
-	for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
-		j = num_rx_total;
+	for (i = 0; i < slave_count && num_rx_total < rx_ring_avail; i++) {
+		j = 0;
 		collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
 					 COLLECTING);
 
 		/* Read packets from this slave */
-		num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
-				&bufs[num_rx_total], nb_pkts - num_rx_total);
+		if (unlikely(rx_ring_avail - num_rx_total <
+				PMD_BOND_RECV_PKTS_PER_SLAVE))
+			continue;
+		num_rx_slave = rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
+				mbuf_bounce, PMD_BOND_RECV_PKTS_PER_SLAVE);
 
-		for (k = j; k < 2 && k < num_rx_total; k++)
-			rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));
+		for (k = j; k < 2 && k < num_rx_slave; k++)
+			rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[k], void *));
 
 		/* Handle slow protocol packets. */
-		while (j < num_rx_total) {
+		while (j < num_rx_slave) {
 
 			/* If packet is not pure L2 and is known, skip it */
-			if ((bufs[j]->packet_type & ~RTE_PTYPE_L2_ETHER) != 0) {
+			if ((mbuf_bounce[j]->packet_type & ~RTE_PTYPE_L2_ETHER) != 0) {
 				j++;
 				continue;
 			}
 
-			if (j + 3 < num_rx_total)
-				rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *));
+			if (j + 3 < num_rx_slave)
+				rte_prefetch0(rte_pktmbuf_mtod(mbuf_bounce[j + 3],
+							       void *));
 
-			hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
+			hdr = rte_pktmbuf_mtod(mbuf_bounce[j],
+					       struct ether_hdr *);
 			subtype = ((struct slow_protocol_frame *)hdr)->slow_protocol.subtype;
 
 			/* Remove packet from array if it is slow packet or slave is not
 			 * in collecting state or bonding interface is not in promiscuous
 			 * mode and packet address does not match. */
-			if (unlikely(is_lacp_packets(hdr->ether_type, subtype, bufs[j]) ||
+			if (unlikely(is_lacp_packets(hdr->ether_type,
+						     subtype, mbuf_bounce[j]) ||
 				!collecting || (!promisc &&
 					!is_multicast_ether_addr(&hdr->d_addr) &&
 					!is_same_ether_addr(&bond_mac, &hdr->d_addr)))) {
 
 				if (hdr->ether_type == ether_type_slow_be) {
 					bond_mode_8023ad_handle_slow_pkt(
-					    internals, slaves[idx], bufs[j]);
+					    internals, slaves[idx],
+					    mbuf_bounce[j]);
 				} else
-					rte_pktmbuf_free(bufs[j]);
+					rte_pktmbuf_free(mbuf_bounce[j]);
 
 				/* Packet is managed by mode 4 or dropped, shift the array */
-				num_rx_total--;
-				if (j < num_rx_total) {
-					memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) *
-						(num_rx_total - j));
+				num_rx_slave--;
+				if (j < num_rx_slave) {
+					memmove(&mbuf_bounce[j],
+						&mbuf_bounce[j + 1],
+						sizeof(mbuf_bounce[0]) *
+						(num_rx_slave - j));
 				}
-			} else
+			} else {
 				j++;
+			}
 		}
+
+		if (num_rx_slave > 0) {
+			if (mbuf_bounce[0] == NULL)
+				RTE_LOG(ERR, PMD, "%s: Enqueue a NULL??\n",
+					__func__);
+
+			num_enq_slave = rte_ring_enqueue_burst(internals->rx_ring,
+							       (void **)mbuf_bounce,
+							       num_rx_slave,
+							       NULL);
+
+			if (num_enq_slave < num_rx_slave) {
+				RTE_LOG(ERR, PMD,
+					"%s: failed to enqueue %u packets",
+					__func__,
+					(num_rx_slave - num_enq_slave));
+				for (j = num_enq_slave; j < num_rx_slave; j++)
+					rte_pktmbuf_free(mbuf_bounce[j]);
+			}
+			num_rx_total += num_enq_slave;
+		}
+
 		if (unlikely(++idx == slave_count))
 			idx = 0;
 	}
 
 	internals->active_slave = idx;
-	return num_rx_total;
+dequeue:
+	return rte_ring_dequeue_burst(internals->rx_ring, (void **)bufs,
+				      nb_pkts, NULL);
 }
 
 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
@@ -3065,6 +3109,7 @@  bond_alloc(struct rte_vdev_device *dev, uint8_t mode)
 	struct bond_dev_private *internals = NULL;
 	struct rte_eth_dev *eth_dev = NULL;
 	uint32_t vlan_filter_bmp_size;
+	char mem_name[RTE_ETH_NAME_MAX_LEN];
 
 	/* now do all data allocation - for eth_dev structure, dummy pci driver
 	 * and internal (private) data
@@ -3129,6 +3174,19 @@  bond_alloc(struct rte_vdev_device *dev, uint8_t mode)
 	TAILQ_INIT(&internals->flow_list);
 	internals->flow_isolated_valid = 0;
 
+	snprintf(mem_name, RTE_DIM(mem_name), "bond_%u_rx", internals->port_id);
+	internals->rx_ring = rte_ring_lookup(mem_name);
+	if (internals->rx_ring == NULL) {
+		internals->rx_ring = rte_ring_create(mem_name,
+				     rte_align32pow2(PMD_BOND_RECV_RING_PKTS *
+						     rte_lcore_count()),
+				     socket_id, 0);
+		if (internals->rx_ring == NULL)
+			rte_panic("%s: Failed to create rx ring '%s': %s\n", name,
+				  mem_name, rte_strerror(rte_errno));
+	}
+
+
 	/* Set mode 4 default configuration */
 	bond_mode_8023ad_setup(eth_dev, NULL);
 	if (bond_ethdev_mode_set(eth_dev, mode)) {
diff --git a/drivers/net/bonding/rte_eth_bond_private.h b/drivers/net/bonding/rte_eth_bond_private.h
index 43e0e448df..80261c6b14 100644
--- a/drivers/net/bonding/rte_eth_bond_private.h
+++ b/drivers/net/bonding/rte_eth_bond_private.h
@@ -26,6 +26,8 @@ 
 #define PMD_BOND_LSC_POLL_PERIOD_KVARG		("lsc_poll_period_ms")
 #define PMD_BOND_LINK_UP_PROP_DELAY_KVARG	("up_delay")
 #define PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG	("down_delay")
+#define PMD_BOND_RECV_RING_PKTS		512
+#define PMD_BOND_RECV_PKTS_PER_SLAVE		32
 
 #define PMD_BOND_XMIT_POLICY_LAYER2_KVARG	("l2")
 #define PMD_BOND_XMIT_POLICY_LAYER23_KVARG	("l23")
@@ -175,6 +177,8 @@  struct bond_dev_private {
 
 	void *vlan_filter_bmpmem;		/* enabled vlan filter bitmap */
 	struct rte_bitmap *vlan_filter_bmp;
+
+	struct rte_ring *rx_ring;
 };
 
 extern const struct eth_dev_ops default_dev_ops;