[v7] net/e1000: fix i219 hang on reset/close

Message ID 1563797960-58560-1-git-send-email-xiao.zhang@intel.com
State Superseded, archived
Delegated to: Qi Zhang
Headers show
Series
  • [v7] net/e1000: fix i219 hang on reset/close
Related show

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Xiao Zhang July 22, 2019, 12:19 p.m.
Unit hang may occur if multiple descriptors are available in the rings
during reset or close. This state can be detected by checking bit 8 of
the PCI configuration status register. If the bit is set and there are
pending descriptors in one of the rings, we must flush them before reset
or close.

Fixes: 80580344 ("e1000: support EM devices (also known as e1000/e1000e)")
Cc: stable@dpdk.org

Signed-off-by: Xiao Zhang <xiao.zhang@intel.com>
---
v7: Add Fixes line.
v6: Apply the fix to the em driver instead of the igb driver and update
the register address according to the C-Spec.
v5: Change the subject.
v4: Correct the tail descriptor of the tx ring.
v3: Add a loop to handle all tx and rx queues.
v2: Use the configuration register instead of NVM7 to get the hang state.
---
 drivers/net/e1000/e1000_ethdev.h |   4 ++
 drivers/net/e1000/em_ethdev.c    |   5 ++
 drivers/net/e1000/em_rxtx.c      | 108 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 117 insertions(+)

Comments

Ye Xiaolong July 22, 2019, 12:34 p.m. | #1
On 07/22, Xiao Zhang wrote:
>Unit hang may occur if multiple descriptors are available in the rings
>during reset or close. This state can be detected by configure status
>by bit 8 in register. If the bit is set and there are pending
>descriptors in one of the rings, we must flush them before reset or
>close.
>
>Fixes: 80580344("e1000: support EM devices (also known as e1000/e1000e)")
>Cc: stable@dpdk.org
>
>Signed-off-by: Xiao Zhang <xiao.zhang@intel.com>
>---
>v7 Add fix line.
>v6 Change the fix on em driver instead of igb driver and update the 
>register address according to C-Spec.
>v5 Change the subject.
>v4 Correct the tail descriptor of tx ring.
>v3 Add loop to handle all tx and rx queues.
>v2 Use configuration register instead of NVM7 to get the hang state.
>---
> drivers/net/e1000/e1000_ethdev.h |   4 ++
> drivers/net/e1000/em_ethdev.c    |   5 ++
> drivers/net/e1000/em_rxtx.c      | 108 +++++++++++++++++++++++++++++++++++++++
> 3 files changed, 117 insertions(+)
>
>diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
>index 67acb73..01ff943 100644
>--- a/drivers/net/e1000/e1000_ethdev.h
>+++ b/drivers/net/e1000/e1000_ethdev.h
>@@ -35,6 +35,9 @@
> #define IGB_MAX_RX_QUEUE_NUM           8
> #define IGB_MAX_RX_QUEUE_NUM_82576     16
> 
>+#define E1000_I219_MAX_RX_QUEUE_NUM		2
>+#define E1000_I219_MAX_TX_QUEUE_NUM		2
>+
> #define E1000_SYN_FILTER_ENABLE        0x00000001 /* syn filter enable field */
> #define E1000_SYN_FILTER_QUEUE         0x0000000E /* syn filter queue field */
> #define E1000_SYN_FILTER_QUEUE_SHIFT   1          /* syn filter queue field */
>@@ -522,5 +525,6 @@ int igb_action_rss_same(const struct rte_flow_action_rss *comp,
> int igb_config_rss_filter(struct rte_eth_dev *dev,
> 			struct igb_rte_flow_rss_conf *conf,
> 			bool add);
>+void em_flush_desc_rings(struct rte_eth_dev *dev);
> 
> #endif /* _E1000_ETHDEV_H_ */
>diff --git a/drivers/net/e1000/em_ethdev.c b/drivers/net/e1000/em_ethdev.c
>index dc88661..62d3a95 100644
>--- a/drivers/net/e1000/em_ethdev.c
>+++ b/drivers/net/e1000/em_ethdev.c
>@@ -738,6 +738,11 @@ eth_em_stop(struct rte_eth_dev *dev)
> 	em_lsc_intr_disable(hw);
> 
> 	e1000_reset_hw(hw);
>+
>+	/* Flush desc rings for i219 */
>+	if (hw->mac.type >= e1000_pch_spt)
>+		em_flush_desc_rings(dev);
>+
> 	if (hw->mac.type >= e1000_82544)
> 		E1000_WRITE_REG(hw, E1000_WUC, 0);
> 
>diff --git a/drivers/net/e1000/em_rxtx.c b/drivers/net/e1000/em_rxtx.c
>index 708f832..96c10cd 100644
>--- a/drivers/net/e1000/em_rxtx.c
>+++ b/drivers/net/e1000/em_rxtx.c
>@@ -18,6 +18,7 @@
> #include <rte_log.h>
> #include <rte_debug.h>
> #include <rte_pci.h>
>+#include <rte_bus_pci.h>
> #include <rte_memory.h>
> #include <rte_memcpy.h>
> #include <rte_memzone.h>
>@@ -59,6 +60,11 @@
> #define E1000_TX_OFFLOAD_NOTSUP_MASK \
> 		(PKT_TX_OFFLOAD_MASK ^ E1000_TX_OFFLOAD_MASK)
> 
>+/* PCI offset for querying configuration status register */
>+#define PCI_CFG_STATUS_REG                 0x06
>+#define FLUSH_DESC_REQUIRED               0x100
>+
>+
> /**
>  * Structure associated with each descriptor of the RX ring of a RX queue.
>  */
>@@ -2000,3 +2006,105 @@ em_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
> 	qinfo->conf.tx_rs_thresh = txq->tx_rs_thresh;
> 	qinfo->conf.offloads = txq->offloads;
> }
>+
>+static void e1000_flush_tx_ring(struct rte_eth_dev *dev)

Minor nit: according to the DPDK coding style, the function's return type
should be on a line by itself, preceding the function name, like this:

static void
e1000_flush_tx_ring(struct rte_eth_dev *dev)

>+{
>+	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
>+	volatile struct e1000_data_desc *tx_desc;
>+	volatile uint32_t *tdt_reg_addr;
>+	uint32_t tdt, tctl, txd_lower = E1000_TXD_CMD_IFCS;
>+	uint16_t size = 512;
>+	struct em_tx_queue *txq;
>+	int i;
>+
>+	if (dev->data->tx_queues == NULL)
>+		return;
>+	tctl = E1000_READ_REG(hw, E1000_TCTL);
>+	E1000_WRITE_REG(hw, E1000_TCTL, tctl | E1000_TCTL_EN);
>+	for (i = 0; i < dev->data->nb_tx_queues &&
>+		i < E1000_I219_MAX_TX_QUEUE_NUM; i++) {
>+		txq = dev->data->tx_queues[i];
>+		tdt = E1000_READ_REG(hw, E1000_TDT(i));
>+		if (tdt != txq->tx_tail)
>+			return;
>+		tx_desc = &txq->tx_ring[txq->tx_tail];
>+		tx_desc->buffer_addr = rte_cpu_to_le_64(txq->tx_ring_phys_addr);
>+		tx_desc->lower.data = rte_cpu_to_le_32(txd_lower | size);
>+		tx_desc->upper.data = 0;
>+
>+		rte_wmb();
>+		txq->tx_tail++;
>+		if (txq->tx_tail == txq->nb_tx_desc)
>+			txq->tx_tail = 0;
>+		rte_io_wmb();
>+		tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(i));
>+		E1000_PCI_REG_WRITE_RELAXED(tdt_reg_addr, txq->tx_tail);
>+		usec_delay(250);
>+	}
>+}
>+
>+static void e1000_flush_rx_ring(struct rte_eth_dev *dev)

Ditto.

>+{
>+	uint32_t rctl, rxdctl;
>+	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
>+	int i;
>+
>+	rctl = E1000_READ_REG(hw, E1000_RCTL);
>+	E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
>+	E1000_WRITE_FLUSH(hw);
>+	usec_delay(150);
>+
>+	for (i = 0; i < dev->data->nb_rx_queues &&
>+		i < E1000_I219_MAX_RX_QUEUE_NUM; i++) {
>+		rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
>+		/* zero the lower 14 bits (prefetch and host thresholds) */
>+		rxdctl &= 0xffffc000;
>+
>+		/* update thresholds: prefetch threshold to 31,
>+		 * host threshold to 1 and make sure the granularity
>+		 * is "descriptors" and not "cache lines"
>+		 */
>+		rxdctl |= (0x1F | (1UL << 8) | E1000_RXDCTL_THRESH_UNIT_DESC);
>+
>+		E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
>+	}
>+	/* momentarily enable the RX ring for the changes to take effect */
>+	E1000_WRITE_REG(hw, E1000_RCTL, rctl | E1000_RCTL_EN);
>+	E1000_WRITE_FLUSH(hw);
>+	usec_delay(150);
>+	E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
>+}
>+
>+/**
>+ * em_flush_desc_rings - remove all descriptors from the descriptor rings
>+ *
>+ * In i219, the descriptor rings must be emptied before resetting/closing the
>+ * HW. Failure to do this will cause the HW to enter a unit hang state which
>+ * can only be released by PCI reset on the device
>+ *
>+ */
>+
>+void em_flush_desc_rings(struct rte_eth_dev *dev)

Ditto.


For the rest of the patch,
Reviewed-by: Xiaolong Ye <xiaolong.ye@intel.com>

>+{
>+	uint32_t fextnvm11, tdlen;
>+	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
>+	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev);
>+	uint16_t pci_cfg_status = 0;
>+
>+	fextnvm11 = E1000_READ_REG(hw, E1000_FEXTNVM11);
>+	E1000_WRITE_REG(hw, E1000_FEXTNVM11,
>+			fextnvm11 | E1000_FEXTNVM11_DISABLE_MULR_FIX);
>+	tdlen = E1000_READ_REG(hw, E1000_TDLEN(0));
>+	rte_pci_read_config(pci_dev, &pci_cfg_status, sizeof(pci_cfg_status),
>+				PCI_CFG_STATUS_REG);
>+
>+	/* do nothing if we're not in faulty state, or if the queue is empty */
>+	if ((pci_cfg_status & FLUSH_DESC_REQUIRED) && tdlen) {
>+		/* flush desc ring */
>+		e1000_flush_tx_ring(dev);
>+		rte_pci_read_config(pci_dev, &pci_cfg_status,
>+				sizeof(pci_cfg_status), PCI_CFG_STATUS_REG);
>+		if (pci_cfg_status & FLUSH_DESC_REQUIRED)
>+			e1000_flush_rx_ring(dev);
>+	}
>+}
>-- 
>2.7.4
>

Patch

diff --git a/drivers/net/e1000/e1000_ethdev.h b/drivers/net/e1000/e1000_ethdev.h
index 67acb73..01ff943 100644
--- a/drivers/net/e1000/e1000_ethdev.h
+++ b/drivers/net/e1000/e1000_ethdev.h
@@ -35,6 +35,9 @@ 
 #define IGB_MAX_RX_QUEUE_NUM           8
 #define IGB_MAX_RX_QUEUE_NUM_82576     16
 
+#define E1000_I219_MAX_RX_QUEUE_NUM		2
+#define E1000_I219_MAX_TX_QUEUE_NUM		2
+
 #define E1000_SYN_FILTER_ENABLE        0x00000001 /* syn filter enable field */
 #define E1000_SYN_FILTER_QUEUE         0x0000000E /* syn filter queue field */
 #define E1000_SYN_FILTER_QUEUE_SHIFT   1          /* syn filter queue field */
@@ -522,5 +525,6 @@  int igb_action_rss_same(const struct rte_flow_action_rss *comp,
 int igb_config_rss_filter(struct rte_eth_dev *dev,
 			struct igb_rte_flow_rss_conf *conf,
 			bool add);
+void em_flush_desc_rings(struct rte_eth_dev *dev);
 
 #endif /* _E1000_ETHDEV_H_ */
diff --git a/drivers/net/e1000/em_ethdev.c b/drivers/net/e1000/em_ethdev.c
index dc88661..62d3a95 100644
--- a/drivers/net/e1000/em_ethdev.c
+++ b/drivers/net/e1000/em_ethdev.c
@@ -738,6 +738,11 @@  eth_em_stop(struct rte_eth_dev *dev)
 	em_lsc_intr_disable(hw);
 
 	e1000_reset_hw(hw);
+
+	/* Flush desc rings for i219 */
+	if (hw->mac.type >= e1000_pch_spt)
+		em_flush_desc_rings(dev);
+
 	if (hw->mac.type >= e1000_82544)
 		E1000_WRITE_REG(hw, E1000_WUC, 0);
 
diff --git a/drivers/net/e1000/em_rxtx.c b/drivers/net/e1000/em_rxtx.c
index 708f832..96c10cd 100644
--- a/drivers/net/e1000/em_rxtx.c
+++ b/drivers/net/e1000/em_rxtx.c
@@ -18,6 +18,7 @@ 
 #include <rte_log.h>
 #include <rte_debug.h>
 #include <rte_pci.h>
+#include <rte_bus_pci.h>
 #include <rte_memory.h>
 #include <rte_memcpy.h>
 #include <rte_memzone.h>
@@ -59,6 +60,11 @@ 
 #define E1000_TX_OFFLOAD_NOTSUP_MASK \
 		(PKT_TX_OFFLOAD_MASK ^ E1000_TX_OFFLOAD_MASK)
 
+/* PCI offset for querying configuration status register */
+#define PCI_CFG_STATUS_REG                 0x06
+#define FLUSH_DESC_REQUIRED               0x100
+
+
 /**
  * Structure associated with each descriptor of the RX ring of a RX queue.
  */
@@ -2000,3 +2006,105 @@  em_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 	qinfo->conf.tx_rs_thresh = txq->tx_rs_thresh;
 	qinfo->conf.offloads = txq->offloads;
 }
+
+static void e1000_flush_tx_ring(struct rte_eth_dev *dev)
+{
+	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	volatile struct e1000_data_desc *tx_desc;
+	volatile uint32_t *tdt_reg_addr;
+	uint32_t tdt, tctl, txd_lower = E1000_TXD_CMD_IFCS;
+	uint16_t size = 512;
+	struct em_tx_queue *txq;
+	int i;
+
+	if (dev->data->tx_queues == NULL)
+		return;
+	tctl = E1000_READ_REG(hw, E1000_TCTL);
+	E1000_WRITE_REG(hw, E1000_TCTL, tctl | E1000_TCTL_EN);
+	for (i = 0; i < dev->data->nb_tx_queues &&
+		i < E1000_I219_MAX_TX_QUEUE_NUM; i++) {
+		txq = dev->data->tx_queues[i];
+		tdt = E1000_READ_REG(hw, E1000_TDT(i));
+		if (tdt != txq->tx_tail)
+			return;
+		tx_desc = &txq->tx_ring[txq->tx_tail];
+		tx_desc->buffer_addr = rte_cpu_to_le_64(txq->tx_ring_phys_addr);
+		tx_desc->lower.data = rte_cpu_to_le_32(txd_lower | size);
+		tx_desc->upper.data = 0;
+
+		rte_wmb();
+		txq->tx_tail++;
+		if (txq->tx_tail == txq->nb_tx_desc)
+			txq->tx_tail = 0;
+		rte_io_wmb();
+		tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(i));
+		E1000_PCI_REG_WRITE_RELAXED(tdt_reg_addr, txq->tx_tail);
+		usec_delay(250);
+	}
+}
+
+static void e1000_flush_rx_ring(struct rte_eth_dev *dev)
+{
+	uint32_t rctl, rxdctl;
+	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	int i;
+
+	rctl = E1000_READ_REG(hw, E1000_RCTL);
+	E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
+	E1000_WRITE_FLUSH(hw);
+	usec_delay(150);
+
+	for (i = 0; i < dev->data->nb_rx_queues &&
+		i < E1000_I219_MAX_RX_QUEUE_NUM; i++) {
+		rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
+		/* zero the lower 14 bits (prefetch and host thresholds) */
+		rxdctl &= 0xffffc000;
+
+		/* update thresholds: prefetch threshold to 31,
+		 * host threshold to 1 and make sure the granularity
+		 * is "descriptors" and not "cache lines"
+		 */
+		rxdctl |= (0x1F | (1UL << 8) | E1000_RXDCTL_THRESH_UNIT_DESC);
+
+		E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
+	}
+	/* momentarily enable the RX ring for the changes to take effect */
+	E1000_WRITE_REG(hw, E1000_RCTL, rctl | E1000_RCTL_EN);
+	E1000_WRITE_FLUSH(hw);
+	usec_delay(150);
+	E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
+}
+
+/**
+ * em_flush_desc_rings - remove all descriptors from the descriptor rings
+ *
+ * In i219, the descriptor rings must be emptied before resetting/closing the
+ * HW. Failure to do this will cause the HW to enter a unit hang state which
+ * can only be released by PCI reset on the device
+ *
+ */
+
+void em_flush_desc_rings(struct rte_eth_dev *dev)
+{
+	uint32_t fextnvm11, tdlen;
+	struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev);
+	uint16_t pci_cfg_status = 0;
+
+	fextnvm11 = E1000_READ_REG(hw, E1000_FEXTNVM11);
+	E1000_WRITE_REG(hw, E1000_FEXTNVM11,
+			fextnvm11 | E1000_FEXTNVM11_DISABLE_MULR_FIX);
+	tdlen = E1000_READ_REG(hw, E1000_TDLEN(0));
+	rte_pci_read_config(pci_dev, &pci_cfg_status, sizeof(pci_cfg_status),
+				PCI_CFG_STATUS_REG);
+
+	/* do nothing if we're not in faulty state, or if the queue is empty */
+	if ((pci_cfg_status & FLUSH_DESC_REQUIRED) && tdlen) {
+		/* flush desc ring */
+		e1000_flush_tx_ring(dev);
+		rte_pci_read_config(pci_dev, &pci_cfg_status,
+				sizeof(pci_cfg_status), PCI_CFG_STATUS_REG);
+		if (pci_cfg_status & FLUSH_DESC_REQUIRED)
+			e1000_flush_rx_ring(dev);
+	}
+}