diff mbox

[dpdk-dev,v2,5/5] l3fwd-power: enable one-shot rx interrupt and polling/interrupt mode switch

Message ID 1422951511-28143-6-git-send-email-danny.zhou@intel.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Zhou, Danny Feb. 3, 2015, 8:18 a.m. UTC
v2 change
- Remove unused function which is for debug purpose

Demonstrate how to handle per rx queue interrupt in a NAPI-like
implementation in usersapce. PDK polling thread mainly works in 
polling mode and switch to interrupt mode only if there is no 
any packet received in recent polls.
Usersapce interrupt notification generally takes a lot more cycles than
kernel, so one-shot interrupt is used here to guarantee minimum overhead
and DPDK polling thread returns to polling mode immediately once it
receives an interrupt notificaiton for incoming packet.

Signed-off-by: Danny Zhou <danny.zhou@intel.com>
Tested-by: Yong Liu <yong.liu@intel.com>
---
 examples/l3fwd-power/main.c | 141 +++++++++++++++++++++++++++++++-------------
 1 file changed, 100 insertions(+), 41 deletions(-)
diff mbox

Patch

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f6b55b9..15f0a5a 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -75,12 +75,13 @@ 
 #include <rte_string_fns.h>
 #include <rte_timer.h>
 #include <rte_power.h>
+#include <rte_eal.h>
 
 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1
 
 #define MAX_PKT_BURST 32
 
-#define MIN_ZERO_POLL_COUNT 5
+#define MIN_ZERO_POLL_COUNT 10
 
 /* around 100ms at 2 Ghz */
 #define TIMER_RESOLUTION_CYCLES           200000000ULL
@@ -188,6 +189,9 @@  struct lcore_rx_queue {
 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
 #define MAX_RX_QUEUE_PER_PORT 128
 
+#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16
+
+
 #define MAX_LCORE_PARAMS 1024
 struct lcore_params {
 	uint8_t port_id;
@@ -214,7 +218,7 @@  static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) /
 
 static struct rte_eth_conf port_conf = {
 	.rxmode = {
-		.mq_mode	= ETH_MQ_RX_RSS,
+		.mq_mode = ETH_MQ_RX_RSS,
 		.max_rx_pkt_len = ETHER_MAX_LEN,
 		.split_hdr_size = 0,
 		.header_split   = 0, /**< Header Split disabled */
@@ -226,11 +230,14 @@  static struct rte_eth_conf port_conf = {
 	.rx_adv_conf = {
 		.rss_conf = {
 			.rss_key = NULL,
-			.rss_hf = ETH_RSS_IP,
+			.rss_hf = ETH_RSS_UDP,
 		},
 	},
 	.txmode = {
-		.mq_mode = ETH_DCB_NONE,
+		.mq_mode = ETH_MQ_TX_NONE,
+	},
+	.intr_conf = {
+		.rxq = 1, /**< rxq interrupt feature enabled */
 	},
 };
 
@@ -402,19 +409,22 @@  power_timer_cb(__attribute__((unused)) struct rte_timer *tim,
 	/* accumulate total execution time in us when callback is invoked */
 	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
 					(float)SCALING_PERIOD;
-
 	/**
 	 * check whether need to scale down frequency a step if it sleep a lot.
 	 */
-	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD)
-		rte_power_freq_down(lcore_id);
+	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 	else if ( (unsigned)(stats[lcore_id].nb_rx_processed /
-		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST)
+		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
 		/**
 		 * scale down a step if average packet per iteration less
 		 * than expectation.
 		 */
-		rte_power_freq_down(lcore_id);
+		if (rte_power_freq_down)
+			rte_power_freq_down(lcore_id);
+	}
 
 	/**
 	 * initialize another timer according to current frequency to ensure
@@ -707,22 +717,20 @@  l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid,
 
 }
 
-#define SLEEP_GEAR1_THRESHOLD            100
-#define SLEEP_GEAR2_THRESHOLD            1000
+#define MINIMUM_SLEEP_TIME         1
+#define SUSPEND_THRESHOLD          300
 
 static inline uint32_t
 power_idle_heuristic(uint32_t zero_rx_packet_count)
 {
-	/* If zero count is less than 100, use it as the sleep time in us */
-	if (zero_rx_packet_count < SLEEP_GEAR1_THRESHOLD)
-		return zero_rx_packet_count;
-	/* If zero count is less than 1000, sleep time should be 100 us */
-	else if ((zero_rx_packet_count >= SLEEP_GEAR1_THRESHOLD) &&
-			(zero_rx_packet_count < SLEEP_GEAR2_THRESHOLD))
-		return SLEEP_GEAR1_THRESHOLD;
-	/* If zero count is greater than 1000, sleep time should be 1000 us */
-	else if (zero_rx_packet_count >= SLEEP_GEAR2_THRESHOLD)
-		return SLEEP_GEAR2_THRESHOLD;
+	/* If zero count is less than 100,  sleep 1us */
+	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
+		return MINIMUM_SLEEP_TIME;
+	/* If zero count is less than 1000, sleep 100 us which is the minimum latency
+	    switching from C3/C6 to C0
+	*/
+	else
+		return SUSPEND_THRESHOLD;
 
 	return 0;
 }
@@ -762,6 +770,35 @@  power_freq_scaleup_heuristic(unsigned lcore_id,
 	return FREQ_CURRENT;
 }
 
+/**
+ * force polling thread sleep until one-shot rx interrupt triggers
+ * @param port_id
+ *  Port id.
+ * @param queue_id
+ *  Rx queue id.
+ * @return
+ *  0 on success
+ */
+static int
+sleep_until_rx_interrupt(uint8_t port_id, uint8_t queue_id)
+{
+	/* Enable one-shot rx interrupt */
+	rte_eth_dev_rx_queue_intr_enable(port_id, queue_id);
+
+	RTE_LOG(INFO, L3FWD_POWER,
+		"lcore %u sleeps until interrupt on port%d,rxq%d triggers\n",
+		rte_lcore_id(), port_id, queue_id);
+	rte_eal_wait_rx_intr(port_id, queue_id);
+	RTE_LOG(INFO, L3FWD_POWER,
+		"lcore %u is waked up from rx interrupt on port%d,rxq%d\n",
+		rte_lcore_id(), port_id, queue_id);
+
+	/* Disable one-shot rx interrupt */
+	rte_eth_dev_rx_queue_intr_disable(port_id, queue_id);
+
+	return 0;
+}
+
 /* main processing loop */
 static int
 main_loop(__attribute__((unused)) void *dummy)
@@ -775,7 +812,6 @@  main_loop(__attribute__((unused)) void *dummy)
 	struct lcore_conf *qconf;
 	struct lcore_rx_queue *rx_queue;
 	enum freq_scale_hint_t lcore_scaleup_hint;
-
 	uint32_t lcore_rx_idle_count = 0;
 	uint32_t lcore_idle_hint = 0;
 
@@ -835,6 +871,8 @@  main_loop(__attribute__((unused)) void *dummy)
 			prev_tsc_power = cur_tsc_power;
 		}
 
+
+start_rx:
 		/*
 		 * Read packet from RX queues
 		 */
@@ -848,6 +886,7 @@  main_loop(__attribute__((unused)) void *dummy)
 
 			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
 								MAX_PKT_BURST);
+
 			stats[lcore_id].nb_rx_processed += nb_rx;
 			if (unlikely(nb_rx == 0)) {
 				/**
@@ -910,10 +949,13 @@  main_loop(__attribute__((unused)) void *dummy)
 						rx_queue->freq_up_hint;
 			}
 
-			if (lcore_scaleup_hint == FREQ_HIGHEST)
-				rte_power_freq_max(lcore_id);
-			else if (lcore_scaleup_hint == FREQ_HIGHER)
-				rte_power_freq_up(lcore_id);
+			if (lcore_scaleup_hint == FREQ_HIGHEST) {
+				if (rte_power_freq_max)
+					rte_power_freq_max(lcore_id);
+			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
+				if (rte_power_freq_up)
+					rte_power_freq_up(lcore_id);
+			}
 		} else {
 			/**
 			 * All Rx queues empty in recent consecutive polls,
@@ -928,16 +970,21 @@  main_loop(__attribute__((unused)) void *dummy)
 					lcore_idle_hint = rx_queue->idle_hint;
 			}
 
-			if ( lcore_idle_hint < SLEEP_GEAR1_THRESHOLD)
+			if (lcore_idle_hint < SUSPEND_THRESHOLD)
 				/**
-				 * execute "pause" instruction to avoid context
-				 * switch for short sleep.
- 				 */
+				* execute "pause" instruction to avoid context
+				* switch which generally take hundres of microsecond
+				* for short sleep.
+				*/
 				rte_delay_us(lcore_idle_hint);
-			else
-				/* long sleep force runing thread to suspend */
-				usleep(lcore_idle_hint);
-
+			else {
+				/* suspend untill rx interrupt trigges */
+				sleep_until_rx_interrupt(
+					qconf->rx_queue_list[0].port_id,
+					qconf->rx_queue_list[0].queue_id);
+				/* start receiving packets immediately */
+				goto start_rx;
+			}
 			stats[lcore_id].sleep_time += lcore_idle_hint;
 		}
 	}
@@ -1270,7 +1317,7 @@  setup_hash(int socketid)
 	char s[64];
 
 	/* create ipv4 hash */
-	snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
 	ipv4_l3fwd_hash_params.name = s;
 	ipv4_l3fwd_hash_params.socket_id = socketid;
 	ipv4_l3fwd_lookup_struct[socketid] =
@@ -1280,7 +1327,7 @@  setup_hash(int socketid)
 				"socket %d\n", socketid);
 
 	/* create ipv6 hash */
-	snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
+	rte_snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
 	ipv6_l3fwd_hash_params.name = s;
 	ipv6_l3fwd_hash_params.socket_id = socketid;
 	ipv6_l3fwd_lookup_struct[socketid] =
@@ -1476,6 +1523,7 @@  main(int argc, char **argv)
 	unsigned lcore_id;
 	uint64_t hz;
 	uint32_t n_tx_queue, nb_lcores;
+	uint32_t dev_rxq_num, dev_txq_num;
 	uint8_t portid, nb_rx_queue, queue, socketid;
 
 	/* catch SIGINT and restore cpufreq governor to ondemand */
@@ -1525,10 +1573,18 @@  main(int argc, char **argv)
 		printf("Initializing port %d ... ", portid );
 		fflush(stdout);
 
+		rte_eth_dev_info_get(portid, &dev_info);
+		dev_rxq_num = dev_info.max_rx_queues;
+		dev_txq_num = dev_info.max_tx_queues;
+
 		nb_rx_queue = get_port_n_rx_queues(portid);
+		if (nb_rx_queue > dev_rxq_num)
+			rte_exit(EXIT_FAILURE, "Cannot configure not existed rxq: "
+					"port=%d\n", portid);
+
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
+		if (n_tx_queue > dev_txq_num)
+			n_tx_queue = dev_txq_num;
 		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
 			nb_rx_queue, (unsigned)n_tx_queue );
 		ret = rte_eth_dev_configure(portid, nb_rx_queue,
@@ -1552,6 +1608,9 @@  main(int argc, char **argv)
 			if (rte_lcore_is_enabled(lcore_id) == 0)
 				continue;
 
+			if (queueid >= dev_txq_num)
+				continue;
+
 			if (numa_on)
 				socketid = \
 				(uint8_t)rte_lcore_to_socket_id(lcore_id);
@@ -1586,8 +1645,9 @@  main(int argc, char **argv)
 		/* init power management library */
 		ret = rte_power_init(lcore_id);
 		if (ret)
-			rte_exit(EXIT_FAILURE, "Power management library "
-				"initialization failed on core%u\n", lcore_id);
+			rte_log(RTE_LOG_ERR, RTE_LOGTYPE_POWER,
+				"Power management library initialization "
+				"failed on core%u", lcore_id);
 
 		/* init timer structures for each enabled lcore */
 		rte_timer_init(&power_timers[lcore_id]);
@@ -1635,7 +1695,6 @@  main(int argc, char **argv)
 		if (ret < 0)
 			rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, "
 						"port=%d\n", ret, portid);
-
 		/*
 		 * If enabled, put device in promiscuous mode.
 		 * This allows IO forwarding mode to forward packets