@@ -612,6 +612,14 @@ Limitations
- When using DV/verbs flow engine (``dv_flow_en`` = 1/0 respectively), Match on SPI field
in ESP header for group 0 needs MLNX_OFED 5.6+.
+- While a process is being live-migrated and the new process has set its flow
+  engine to standby mode, the user should only program flow rules in group 0
+  (``fdb_def_rule_en=0``). Live migration is only supported under SWS
+  (``dv_flow_en=1``). Flow group 0 is shared between DPDK processes,
+  while the other flow groups are private to the current process.
+  The flow engine of a process cannot move from active to standby mode
+  if rules from the preceding active application are still present, and vice versa.
+
Statistics
----------
@@ -33,6 +33,7 @@
#include "mlx5_utils.h"
#include "mlx5_os.h"
#include "mlx5_autoconf.h"
+#include "rte_pmd_mlx5.h"
#if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
#ifndef RTE_EXEC_ENV_WINDOWS
#define HAVE_MLX5_HWS_SUPPORT 1
@@ -1656,6 +1657,28 @@ struct mlx5_hw_ctrl_flow {
struct rte_flow *flow;
};
+/*
+ * Cached flow rule used for flow engine mode control; only group 0 rules
+ * are cached. Applies to all supported steering domains.
+ */
+struct mlx5_dv_flow_info {
+ LIST_ENTRY(mlx5_dv_flow_info) next;
+ uint32_t orig_prio; /* prio set by user */
+ uint32_t flow_idx_high_prio;
+	/* Flow index owned by standby mode; its priority is lower unless the DUP flag is set. */
+ uint32_t flow_idx_low_prio;
+ struct rte_flow_item *items;
+ struct rte_flow_action *actions;
+ struct rte_flow_attr attr;
+};
+
+struct mlx5_flow_engine_mode_info {
+ enum mlx5_flow_engine_mode mode;
+ uint32_t mode_flag;
+ /* The list is maintained in insertion order. */
+ LIST_HEAD(hot_up_info, mlx5_dv_flow_info) hot_upgrade;
+};
+
struct mlx5_flow_hw_ctrl_rx;
struct mlx5_priv {
@@ -1763,6 +1786,7 @@ struct mlx5_priv {
uint32_t nb_queue; /* HW steering queue number. */
struct mlx5_hws_cnt_pool *hws_cpool; /* HW steering's counter pool. */
uint32_t hws_mark_refcnt; /* HWS mark action reference counter. */
+ struct mlx5_flow_engine_mode_info mode_info; /* Process set flow engine info. */
#if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
/* Item template list. */
LIST_HEAD(flow_hw_itt, rte_flow_pattern_template) flow_hw_itt;
@@ -164,6 +164,16 @@ mlx5_flow_expand_rss_adjust_node(const struct rte_flow_item *pattern,
const struct mlx5_flow_expand_node graph[],
const struct mlx5_flow_expand_node *node);
+static __rte_always_inline int
+mlx5_need_cache_flow(const struct mlx5_priv *priv,
+ const struct rte_flow_attr *attr)
+{
+ return priv->isolated && priv->sh->config.dv_flow_en == 1 &&
+ (attr ? !attr->group : true) &&
+ priv->mode_info.mode == MLX5_FLOW_ENGINE_MODE_STANDBY &&
+ (!priv->sh->config.dv_esw_en || !priv->sh->config.fdb_def_rule);
+}
+
static bool
mlx5_flow_is_rss_expandable_item(const struct rte_flow_item *item)
{
@@ -7477,6 +7487,254 @@ mlx5_flow_validate(struct rte_eth_dev *dev,
return ret;
}
+static int
+mlx5_flow_cache_flow_info(struct rte_eth_dev *dev,
+ const struct rte_flow_attr *attr,
+ const uint32_t orig_prio,
+ const struct rte_flow_item *items,
+ const struct rte_flow_action *actions,
+ uint32_t flow_idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_engine_mode_info *mode_info = &priv->mode_info;
+ struct mlx5_dv_flow_info *flow_info, *tmp_info;
+ struct rte_flow_error error;
+ int len, ret;
+
+ flow_info = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*flow_info), 0, SOCKET_ID_ANY);
+ if (!flow_info) {
+ DRV_LOG(ERR, "No enough memory for flow_info caching.");
+ return -1;
+ }
+ flow_info->orig_prio = orig_prio;
+ flow_info->attr = *attr;
+	/* A standby-mode rule is always saved in the low-priority entry. */
+ flow_info->flow_idx_low_prio = flow_idx;
+
+ /* Store matching items. */
+ ret = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN, NULL, 0, items, &error);
+ if (ret <= 0) {
+ DRV_LOG(ERR, "Can't get items length.");
+ goto end;
+ }
+ len = RTE_ALIGN(ret, 16);
+ flow_info->items = mlx5_malloc(MLX5_MEM_ZERO, len, 0, SOCKET_ID_ANY);
+ if (!flow_info->items) {
+ DRV_LOG(ERR, "No enough memory for items caching.");
+ goto end;
+ }
+ ret = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN, flow_info->items, ret, items, &error);
+ if (ret <= 0) {
+ DRV_LOG(ERR, "Can't duplicate items.");
+ goto end;
+ }
+
+ /* Store flow actions. */
+ ret = rte_flow_conv(RTE_FLOW_CONV_OP_ACTIONS, NULL, 0, actions, &error);
+ if (ret <= 0) {
+ DRV_LOG(ERR, "Can't get actions length.");
+ goto end;
+ }
+ len = RTE_ALIGN(ret, 16);
+ flow_info->actions = mlx5_malloc(MLX5_MEM_ZERO, len, 0, SOCKET_ID_ANY);
+ if (!flow_info->actions) {
+ DRV_LOG(ERR, "No enough memory for actions caching.");
+ goto end;
+ }
+ ret = rte_flow_conv(RTE_FLOW_CONV_OP_ACTIONS, flow_info->actions, ret, actions, &error);
+ if (ret <= 0) {
+ DRV_LOG(ERR, "Can't duplicate actions.");
+ goto end;
+ }
+
+ /* Insert to the list end. */
+ if (LIST_EMPTY(&mode_info->hot_upgrade)) {
+ LIST_INSERT_HEAD(&mode_info->hot_upgrade, flow_info, next);
+ } else {
+ tmp_info = LIST_FIRST(&mode_info->hot_upgrade);
+ while (LIST_NEXT(tmp_info, next))
+ tmp_info = LIST_NEXT(tmp_info, next);
+ LIST_INSERT_AFTER(tmp_info, flow_info, next);
+ }
+ return 0;
+end:
+ if (flow_info->items)
+ mlx5_free(flow_info->items);
+ if (flow_info->actions)
+ mlx5_free(flow_info->actions);
+ mlx5_free(flow_info);
+ return -1;
+}
+
+static int
+mlx5_flow_cache_flow_toggle(struct rte_eth_dev *dev, bool orig_prio)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_engine_mode_info *mode_info = &priv->mode_info;
+ struct mlx5_dv_flow_info *flow_info;
+ struct rte_flow_attr attr;
+ struct rte_flow_error error;
+ struct rte_flow *high, *low;
+
+ flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+ while (flow_info) {
+ /* DUP flow may have the same priority. */
+ if (flow_info->orig_prio != flow_info->attr.priority) {
+ attr = flow_info->attr;
+ if (orig_prio)
+ attr.priority = flow_info->orig_prio;
+ flow_info->flow_idx_high_prio = flow_list_create(dev, MLX5_FLOW_TYPE_GEN,
+ &attr, flow_info->items, flow_info->actions,
+ true, &error);
+ if (!flow_info->flow_idx_high_prio) {
+ DRV_LOG(ERR, "Priority toggle failed internally.");
+ goto err;
+ }
+ }
+ flow_info = LIST_NEXT(flow_info, next);
+ }
+ /* Delete the low priority rules and swap the flow handle. */
+ flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+ while (flow_info) {
+ MLX5_ASSERT(flow_info->flow_idx_low_prio);
+ if (flow_info->orig_prio != flow_info->attr.priority) {
+ high = mlx5_ipool_get(priv->flows[MLX5_FLOW_TYPE_GEN],
+ flow_info->flow_idx_high_prio);
+ low = mlx5_ipool_get(priv->flows[MLX5_FLOW_TYPE_GEN],
+ flow_info->flow_idx_low_prio);
+ if (high && low) {
+ RTE_SWAP(*low, *high);
+ flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN,
+ flow_info->flow_idx_low_prio);
+ flow_info->flow_idx_high_prio = 0;
+ }
+ }
+ flow_info = LIST_NEXT(flow_info, next);
+ }
+ return 0;
+err:
+ /* Destroy preceding successful high priority rules. */
+ flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+ while (flow_info) {
+ if (flow_info->orig_prio != flow_info->attr.priority) {
+ if (flow_info->flow_idx_high_prio)
+ flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN,
+ flow_info->flow_idx_high_prio);
+ else
+ break;
+ flow_info->flow_idx_high_prio = 0;
+ }
+ flow_info = LIST_NEXT(flow_info, next);
+ }
+ return -1;
+}
+
+/**
+ * Set the mode of the flow engine of a process to active or standby during live migration.
+ *
+ * @param[in] mode
+ * MLX5 flow engine mode, @see `enum mlx5_flow_engine_mode`.
+ * @param[in] flags
+ * Flow engine mode specific flags.
+ *
+ * @return
+ *   Positive number of switched ports on success, negative error code otherwise.
+ */
+int
+rte_pmd_mlx5_flow_engine_set_mode(enum mlx5_flow_engine_mode mode, uint32_t flags)
+{
+ struct mlx5_priv *priv;
+ struct mlx5_flow_engine_mode_info *mode_info;
+ struct mlx5_dv_flow_info *flow_info, *tmp_info;
+ uint16_t port, port_id;
+ uint16_t toggle_num = 0;
+ struct rte_eth_dev *dev;
+ enum mlx5_flow_engine_mode orig_mode;
+ uint32_t orig_flags;
+ bool need_toggle = false;
+
+ /* Check if flags combinations are supported. */
+ if (flags && flags != MLX5_FLOW_ENGINE_FLAG_STANDBY_DUP_INGRESS) {
+ DRV_LOG(ERR, "Doesn't support such flags %u", flags);
+ return -1;
+ }
+ MLX5_ETH_FOREACH_DEV(port, NULL) {
+ dev = &rte_eth_devices[port];
+ priv = dev->data->dev_private;
+ mode_info = &priv->mode_info;
+ /* No mode change. Assume all devices hold the same mode. */
+ if (mode_info->mode == mode) {
+ DRV_LOG(INFO, "Process flow engine has been in mode %u", mode);
+ if (mode_info->mode_flag != flags && !LIST_EMPTY(&mode_info->hot_upgrade)) {
+ DRV_LOG(ERR, "Port %u has rule cache with different flag %u\n",
+ port, mode_info->mode_flag);
+ orig_mode = mode_info->mode;
+ orig_flags = mode_info->mode_flag;
+ goto err;
+ }
+ mode_info->mode_flag = flags;
+ toggle_num++;
+ continue;
+ }
+ /* Active -> standby. */
+ if (mode == MLX5_FLOW_ENGINE_MODE_STANDBY) {
+ if (!LIST_EMPTY(&mode_info->hot_upgrade)) {
+ DRV_LOG(ERR, "Cached rule existed");
+ orig_mode = mode_info->mode;
+ orig_flags = mode_info->mode_flag;
+ goto err;
+ }
+ mode_info->mode_flag = flags;
+ mode_info->mode = mode;
+ toggle_num++;
+ /* Standby -> active. */
+ } else if (mode == MLX5_FLOW_ENGINE_MODE_ACTIVE) {
+ if (LIST_EMPTY(&mode_info->hot_upgrade)) {
+ DRV_LOG(INFO, "No cached rule existed");
+ } else {
+ if (mlx5_flow_cache_flow_toggle(dev, true)) {
+ orig_mode = mode_info->mode;
+ orig_flags = mode_info->mode_flag;
+ need_toggle = true;
+ goto err;
+ }
+ }
+ toggle_num++;
+ }
+ }
+ if (mode == MLX5_FLOW_ENGINE_MODE_ACTIVE) {
+ /* Clear cache flow rules. */
+ MLX5_ETH_FOREACH_DEV(port, NULL) {
+ priv = rte_eth_devices[port].data->dev_private;
+ mode_info = &priv->mode_info;
+ flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+ while (flow_info) {
+ tmp_info = LIST_NEXT(flow_info, next);
+ LIST_REMOVE(flow_info, next);
+ mlx5_free(flow_info->actions);
+ mlx5_free(flow_info->items);
+ mlx5_free(flow_info);
+ flow_info = tmp_info;
+ }
+ MLX5_ASSERT(LIST_EMPTY(&mode_info->hot_upgrade));
+ }
+ }
+ return toggle_num;
+err:
+ /* Rollback all preceding successful ports. */
+ MLX5_ETH_FOREACH_DEV(port_id, NULL) {
+ if (port_id == port)
+ break;
+ priv = rte_eth_devices[port_id].data->dev_private;
+ mode_info = &priv->mode_info;
+ if (need_toggle && !LIST_EMPTY(&mode_info->hot_upgrade) &&
+ mlx5_flow_cache_flow_toggle(dev, false))
+ return -EPERM;
+ mode_info->mode = orig_mode;
+ mode_info->mode_flag = orig_flags;
+ }
+ return -EINVAL;
+}
/**
* Create a flow.
*
@@ -7491,6 +7749,9 @@ mlx5_flow_create(struct rte_eth_dev *dev,
struct rte_flow_error *error)
{
struct mlx5_priv *priv = dev->data->dev_private;
+ struct rte_flow_attr *new_attr = (void *)(uintptr_t)attr;
+ uint32_t prio = attr->priority;
+ uint32_t flow_idx;
if (priv->sh->config.dv_flow_en == 2) {
rte_flow_error_set(error, ENOTSUP,
@@ -7513,10 +7774,22 @@ mlx5_flow_create(struct rte_eth_dev *dev,
"port not started");
return NULL;
}
-
- return (void *)(uintptr_t)flow_list_create(dev, MLX5_FLOW_TYPE_GEN,
- attr, items, actions,
- true, error);
+ if (unlikely(mlx5_need_cache_flow(priv, attr))) {
+ if (attr->transfer ||
+ (attr->ingress &&
+ !(priv->mode_info.mode_flag & MLX5_FLOW_ENGINE_FLAG_STANDBY_DUP_INGRESS)))
+ new_attr->priority += 1;
+ }
+ flow_idx = flow_list_create(dev, MLX5_FLOW_TYPE_GEN, attr, items, actions, true, error);
+ if (!flow_idx)
+ return NULL;
+ if (unlikely(mlx5_need_cache_flow(priv, attr))) {
+ if (mlx5_flow_cache_flow_info(dev, attr, prio, items, actions, flow_idx)) {
+ flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN, flow_idx);
+ flow_idx = 0;
+ }
+ }
+ return (void *)(uintptr_t)flow_idx;
}
/**
@@ -7573,6 +7846,8 @@ mlx5_flow_list_flush(struct rte_eth_dev *dev, enum mlx5_flow_type type,
struct mlx5_priv *priv = dev->data->dev_private;
uint32_t num_flushed = 0, fidx = 1;
struct rte_flow *flow;
+ struct mlx5_flow_engine_mode_info *mode_info = &priv->mode_info;
+ struct mlx5_dv_flow_info *flow_info;
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
if (priv->sh->config.dv_flow_en == 2 &&
@@ -7584,6 +7859,21 @@ mlx5_flow_list_flush(struct rte_eth_dev *dev, enum mlx5_flow_type type,
MLX5_IPOOL_FOREACH(priv->flows[type], fidx, flow) {
flow_list_destroy(dev, type, fidx);
+ if (unlikely(mlx5_need_cache_flow(priv, NULL) && type == MLX5_FLOW_TYPE_GEN)) {
+ flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+ while (flow_info) {
+				/* Remove the cached flow info. */
+ if (flow_info->flow_idx_low_prio == (uint32_t)(uintptr_t)fidx) {
+ MLX5_ASSERT(!flow_info->flow_idx_high_prio);
+ LIST_REMOVE(flow_info, next);
+ mlx5_free(flow_info->items);
+ mlx5_free(flow_info->actions);
+ mlx5_free(flow_info);
+ break;
+ }
+ flow_info = LIST_NEXT(flow_info, next);
+ }
+ }
num_flushed++;
}
if (active) {
@@ -8032,6 +8322,8 @@ mlx5_flow_destroy(struct rte_eth_dev *dev,
struct rte_flow_error *error __rte_unused)
{
struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_flow_engine_mode_info *mode_info = &priv->mode_info;
+ struct mlx5_dv_flow_info *flow_info;
if (priv->sh->config.dv_flow_en == 2)
return rte_flow_error_set(error, ENOTSUP,
@@ -8040,6 +8332,21 @@ mlx5_flow_destroy(struct rte_eth_dev *dev,
"Flow non-Q destruction not supported");
flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN,
(uintptr_t)(void *)flow);
+ if (unlikely(mlx5_need_cache_flow(priv, NULL))) {
+ flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+ while (flow_info) {
+			/* Remove the cached flow info. */
+ if (flow_info->flow_idx_low_prio == (uint32_t)(uintptr_t)flow) {
+ MLX5_ASSERT(!flow_info->flow_idx_high_prio);
+ LIST_REMOVE(flow_info, next);
+ mlx5_free(flow_info->items);
+ mlx5_free(flow_info->actions);
+ mlx5_free(flow_info);
+ break;
+ }
+ flow_info = LIST_NEXT(flow_info, next);
+ }
+ }
return 0;
}
@@ -158,6 +158,72 @@ int rte_pmd_mlx5_host_shaper_config(int port_id, uint8_t rate, uint32_t flags);
__rte_experimental
int rte_pmd_mlx5_external_sq_enable(uint16_t port_id, uint32_t sq_num);
+/* MLX5 flow engine mode definition for live migration. */
+enum mlx5_flow_engine_mode {
+ MLX5_FLOW_ENGINE_MODE_ACTIVE, /* active means high priority, effective in HW. */
+ MLX5_FLOW_ENGINE_MODE_STANDBY, /* standby mode with lower priority flow rules. */
+};
+
+/**
+ * When set on the flow engine of a standby process, ingress flow rules will be effective
+ * in active and standby processes, so the ingress traffic may be duplicated.
+ */
+#define MLX5_FLOW_ENGINE_FLAG_STANDBY_DUP_INGRESS RTE_BIT32(0)
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Set the flow engine mode of the process to active or standby,
+ * affecting network traffic handling.
+ *
+ * If one device does not support this operation or fails,
+ * the whole operation is failed and rolled back.
+ *
+ * It is forbidden to have multiple flow engines with the same mode
+ * unless only one of them is configured to handle the traffic.
+ *
+ * The application's flow engine is active by default.
+ * The configuration from the active flow engine is effective immediately
+ * while the configuration from the standby flow engine is queued by hardware.
+ * When configuring the device from a standby flow engine,
+ * it takes no effect except in the following situations:
+ * - traffic not handled by the active flow engine configuration
+ * - no active flow engine
+ *
+ * When flow engine of a process is changed from a standby to an active mode,
+ * all preceding configurations that are queued by hardware
+ * should become effective immediately.
+ * Before mode transition, all the traffic handling configurations
+ * set by the active flow engine should be flushed first.
+ *
+ * In summary, the operations are expected to happen in this order
+ * in "old" and "new" applications:
+ * device: already configured by the old application
+ * new: start as active
+ * new: probe the same device
+ * new: set as standby
+ * new: configure the device
+ * device: has configurations from old and new applications
+ * old: clear its device configuration
+ * device: has only 1 configuration from new application
+ * new: set as active
+ * device: downtime for connecting all to the new application
+ * old: shutdown
+ *
+ * @param mode
+ * The desired mode `mlx5_flow_engine_mode`.
+ * @param flags
+ * Mode specific flags.
+ * @return
+ * Positive value on success, -rte_errno value on error:
+ * - (> 0) Number of switched devices.
+ * - (-EINVAL) if error happen and rollback internally.
+ * - (-EPERM) if operation failed and can't recover.
+ */
+__rte_experimental
+int rte_pmd_mlx5_flow_engine_set_mode(enum mlx5_flow_engine_mode mode, uint32_t flags);
+
#ifdef __cplusplus
}
#endif
@@ -15,4 +15,6 @@ EXPERIMENTAL {
# added in 22.07
rte_pmd_mlx5_host_shaper_config;
rte_pmd_mlx5_external_sq_enable;
+ # added in 23.03
+ rte_pmd_mlx5_flow_engine_set_mode;
};