[09/13] net/bnxt: add code for periodic FW health monitoring

Message ID 20190822055400.30119-10-ajit.khaparde@broadcom.com (mailing list archive)
State Changes Requested, archived
Delegated to: Ferruh Yigit
Headers
Series bnxt patchset to support device error recovery |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail Compilation issues

Commit Message

Ajit Khaparde Aug. 22, 2019, 5:53 a.m. UTC
  From: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>

Periodically poll the FW heartbeat register and FW recovery counter
registers to check the FW health. Polling frequency will be
advertised by the FW in HWRM_ERROR_RECOVERY_QCFG response.
Schedule the task upon receiving the async event from FW.

Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
---
 drivers/net/bnxt/bnxt.h        |  5 ++
 drivers/net/bnxt/bnxt_cpr.c    |  7 +++
 drivers/net/bnxt/bnxt_ethdev.c | 89 ++++++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+)
  

Patch

diff --git a/drivers/net/bnxt/bnxt.h b/drivers/net/bnxt/bnxt.h
index f9147a9a8..a23c4a64c 100644
--- a/drivers/net/bnxt/bnxt.h
+++ b/drivers/net/bnxt/bnxt.h
@@ -368,6 +368,9 @@  struct bnxt_error_recovery_info {
 #define BNXT_FLAG_MASTER_FUNC		(1 << 2)
 #define BNXT_FLAG_RECOVERY_ENABLED	(1 << 3)
 	uint32_t	flags;
+
+	uint32_t        last_heart_beat;
+	uint32_t        last_reset_counter;
 };
 
 /* address space location of register */
@@ -531,6 +534,8 @@  int bnxt_rcv_msg_from_vf(struct bnxt *bp, uint16_t vf_id, void *msg);
 int is_bnxt_in_error(struct bnxt *bp);
 
 int bnxt_map_fw_health_status_regs(struct bnxt *bp);
+uint32_t bnxt_read_fw_status_reg(struct bnxt *bp, uint32_t index);
+void bnxt_schedule_fw_health_check(struct bnxt *bp);
 
 bool is_bnxt_supported(struct rte_eth_dev *dev);
 bool bnxt_stratus_device(struct bnxt *bp);
diff --git a/drivers/net/bnxt/bnxt_cpr.c b/drivers/net/bnxt/bnxt_cpr.c
index 7f5b3314e..a692fbe7c 100644
--- a/drivers/net/bnxt/bnxt_cpr.c
+++ b/drivers/net/bnxt/bnxt_cpr.c
@@ -88,6 +88,13 @@  void bnxt_handle_async_event(struct bnxt *bp,
 		PMD_DRV_LOG(INFO, "recovery enabled(%d), master function(%d)\n",
 			    bnxt_is_recovery_enabled(bp),
 			    bnxt_is_master_func(bp));
+
+		info->last_heart_beat =
+			bnxt_read_fw_status_reg(bp, BNXT_FW_HEARTBEAT_CNT_REG);
+		info->last_reset_counter =
+			bnxt_read_fw_status_reg(bp, BNXT_FW_RECOVERY_CNT_REG);
+
+		bnxt_schedule_fw_health_check(bp);
 		break;
 	default:
 		PMD_DRV_LOG(INFO, "handle_async_event id = 0x%x\n", event_id);
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 52c460d2c..0317eb888 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -169,6 +169,7 @@  static int bnxt_mtu_set_op(struct rte_eth_dev *eth_dev, uint16_t new_mtu);
 static int bnxt_dev_uninit(struct rte_eth_dev *eth_dev);
 static int bnxt_init_resources(struct bnxt *bp, bool reconfig_dev);
 static int bnxt_uninit_resources(struct bnxt *bp, bool reconfig_dev);
+static void bnxt_cancel_fw_health_check(struct bnxt *bp);
 
 int is_bnxt_in_error(struct bnxt *bp)
 {
@@ -880,6 +881,8 @@  static void bnxt_dev_stop_op(struct rte_eth_dev *eth_dev)
 	/* disable uio/vfio intr/eventfd mapping */
 	rte_intr_disable(intr_handle);
 
+	bnxt_cancel_fw_health_check(bp);
+
 	bp->flags &= ~BNXT_FLAG_INIT_DONE;
 	if (bp->eth_dev->data->dev_started) {
 		/* TBD: STOP HW queues DMA */
@@ -3608,6 +3611,92 @@  int bnxt_dev_reset_and_resume(struct bnxt *bp)
 	return rc;
 }
 
+/* Read FW status register 'index' from PCI config space or BAR0. */
+uint32_t bnxt_read_fw_status_reg(struct bnxt *bp, uint32_t index)
+{
+	struct bnxt_error_recovery_info *info = bp->recovery_info;
+	uint32_t reg = info->status_regs[index];
+	uint32_t offset = BNXT_FW_STATUS_REG_OFF(reg);
+	uint32_t val = 0;
+
+	switch (BNXT_FW_STATUS_REG_TYPE(reg)) {
+	case BNXT_FW_STATUS_REG_TYPE_CFG:
+		rte_pci_read_config(bp->pdev, &val, sizeof(val), offset);
+		break;
+	case BNXT_FW_STATUS_REG_TYPE_GRC:
+		/* GRC type: switch to the mapped offset, then read via BAR0. */
+		offset = info->mapped_status_regs[index];
+		/* FALLTHROUGH */
+	case BNXT_FW_STATUS_REG_TYPE_BAR0:
+		val = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
+				       offset));
+		break;
+	}
+
+	return val;
+}
+
+/* Alarm callback that polls the FW heartbeat and reset_counter registers.
+ * While the FW looks healthy it re-arms itself using the polling frequency
+ * advertised by FW in HWRM_ERROR_RECOVERY_QCFG. A stalled heartbeat or a
+ * changed reset_counter is treated as a dead FW: the device is flagged for
+ * reset and polling stops.
+ * A "master PF" is the function that has the privilege to initiate the
+ * chimp reset. It is elected by the firmware and notified through an async
+ * message.
+ */
+static void bnxt_check_fw_health(void *arg)
+{
+	struct bnxt *bp = arg;
+	struct bnxt_error_recovery_info *info = bp->recovery_info;
+	uint32_t heartbeat, reset_cnt;
+
+	/* No polling when recovery is off or the device is in error. */
+	if (!info || !bnxt_is_recovery_enabled(bp) || is_bnxt_in_error(bp))
+		return;
+
+	heartbeat = bnxt_read_fw_status_reg(bp, BNXT_FW_HEARTBEAT_CNT_REG);
+	if (heartbeat == info->last_heart_beat)
+		goto fw_dead;
+	info->last_heart_beat = heartbeat;
+
+	reset_cnt = bnxt_read_fw_status_reg(bp, BNXT_FW_RECOVERY_CNT_REG);
+	if (reset_cnt != info->last_reset_counter)
+		goto fw_dead;
+	info->last_reset_counter = reset_cnt;
+
+	/* FW is healthy: schedule the next poll. */
+	rte_eal_alarm_set(US_PER_MS * info->driver_polling_freq,
+			  bnxt_check_fw_health, (void *)bp);
+	return;
+
+fw_dead:
+	/* Stop DMA to/from device */
+	bp->flags |= BNXT_FLAG_FATAL_ERROR;
+	bp->flags |= BNXT_FLAG_FW_RESET;
+
+	PMD_DRV_LOG(ERR, "Detected FW dead condition\n");
+}
+
+/* Arm the FW health-check alarm; does nothing if recovery is not enabled. */
+void bnxt_schedule_fw_health_check(struct bnxt *bp)
+{
+	if (!bnxt_is_recovery_enabled(bp))
+		return;
+
+	/* Only touch recovery_info once recovery is known to be enabled. */
+	rte_eal_alarm_set(US_PER_MS * bp->recovery_info->driver_polling_freq,
+			  bnxt_check_fw_health, (void *)bp);
+}
+
+/* Cancel the pending FW health-check alarm of this device, if any. */
+static void bnxt_cancel_fw_health_check(struct bnxt *bp)
+{
+	/* The alarm is only armed when recovery is enabled. */
+	if (bnxt_is_recovery_enabled(bp))
+		rte_eal_alarm_cancel(bnxt_check_fw_health, (void *)bp);
+}
+
 static bool bnxt_vf_pciid(uint16_t id)
 {
 	if (id == BROADCOM_DEV_ID_57304_VF ||