diff mbox series

[v2] pflock: implementation of phase-fair reader writer locks

Message ID 20210303183019.70700-1-sthemmin@microsoft.com (mailing list archive)
State Superseded
Headers show
Series [v2] pflock: implementation of phase-fair reader writer locks | expand

Checks

Context Check Description
ci/iol-testing success Testing PASS
ci/iol-abi-testing success Testing PASS
ci/github-robot success github build: passed
ci/iol-mellanox-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/travis-robot success travis build: passed
ci/intel-Testing success Testing PASS
ci/Intel-compilation success Compilation OK
ci/checkpatch warning coding style issues

Commit Message

Stephen Hemminger March 3, 2021, 6:30 p.m. UTC
This is a new type of reader-writer lock that provides better fairness
guarantees which makes it better for typical DPDK applications.
They lock internally uses two ticket pools, one for readers and one
for writers.

Phase fair reader writer locks ensure that neither reader or writer will be
starved. Neither reader or writer are preferred, they execute in
alternating phases. All operations of the same time (reader or writer)
that try to acquire the lock are handled in FIFO order.  Write
operations are exclusive, and multiple read operations can be run
together (until a write arrives).

A similar implementation is in Concurrency Kit package in FreeBSD.
For more information see:
   "Reader-Writer Synchronization for Shared-Memory Multiprocessor
    Real-Time Systems",
    http://www.cs.unc.edu/~anderson/papers/ecrts09b.pdf

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
v2 - fix truncation of mask (reported by some compilers)
     and docbook formatting

 app/test/meson.build                        |   6 +
 app/test/test_pflock.c                      | 542 ++++++++++++++++++++
 lib/librte_eal/arm/include/meson.build      |   1 +
 lib/librte_eal/arm/include/rte_pflock.h     |  18 +
 lib/librte_eal/include/generic/rte_pflock.h | 273 ++++++++++
 lib/librte_eal/ppc/include/meson.build      |   1 +
 lib/librte_eal/ppc/include/rte_pflock.h     |  16 +
 lib/librte_eal/x86/include/meson.build      |   1 +
 lib/librte_eal/x86/include/rte_pflock.h     |  18 +
 9 files changed, 876 insertions(+)
 create mode 100644 app/test/test_pflock.c
 create mode 100644 lib/librte_eal/arm/include/rte_pflock.h
 create mode 100644 lib/librte_eal/include/generic/rte_pflock.h
 create mode 100644 lib/librte_eal/ppc/include/rte_pflock.h
 create mode 100644 lib/librte_eal/x86/include/rte_pflock.h
diff mbox series

Patch

diff --git a/app/test/meson.build b/app/test/meson.build
index 561e493a2944..134098de9ac2 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -90,6 +90,7 @@  test_sources = files('commands.c',
 	'test_mcslock.c',
 	'test_mp_secondary.c',
 	'test_per_lcore.c',
+	'test_pflock.c',
 	'test_pmd_perf.c',
 	'test_power.c',
 	'test_power_cpufreq.c',
@@ -228,6 +229,11 @@  fast_tests = [
         ['meter_autotest', true],
         ['multiprocess_autotest', false],
         ['per_lcore_autotest', true],
+        ['pflock_autotest', true],
+        ['pflock_test1_autotest', true],
+        ['pflock_rda_autotest', true],
+        ['pflock_rds_wrm_autotest', true],
+        ['pflock_rde_wro_autotest', true],
         ['prefetch_autotest', true],
         ['rcu_qsbr_autotest', true],
         ['red_autotest', true],
diff --git a/app/test/test_pflock.c b/app/test/test_pflock.c
new file mode 100644
index 000000000000..cd36c02cc710
--- /dev/null
+++ b/app/test/test_pflock.c
@@ -0,0 +1,542 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/queue.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_pause.h>
+#include <rte_pflock.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+
+#include "test.h"
+
+/*
+ * phase fair lock test
+ * ===========
+ * Provides UT for phase fair lock API.
+ * Main concern is on functional testing, but also provides some
+ * performance measurements.
+ * Obviously for proper testing need to be executed with more than one lcore.
+ */
+
+#define ITER_NUM	0x80
+
+#define TEST_SEC	5
+
+static rte_pflock_t sl;
+static rte_pflock_t sl_tab[RTE_MAX_LCORE];
+static uint32_t synchro;
+
+enum {
+	LC_TYPE_RDLOCK,
+	LC_TYPE_WRLOCK,
+};
+
+static struct {
+	rte_pflock_t lock;
+	uint64_t tick;
+	volatile union {
+		uint8_t u8[RTE_CACHE_LINE_SIZE];
+		uint64_t u64[RTE_CACHE_LINE_SIZE / sizeof(uint64_t)];
+	} data;
+} __rte_cache_aligned try_pflock_data;
+
+struct try_pflock_lcore {
+	int32_t rc;
+	int32_t type;
+	struct {
+		uint64_t tick;
+		uint64_t fail;
+		uint64_t success;
+	} stat;
+} __rte_cache_aligned;
+
+static struct try_pflock_lcore try_lcore_data[RTE_MAX_LCORE];
+
+static int
+test_pflock_per_core(__rte_unused void *arg)
+{
+	rte_pflock_write_lock(&sl);
+	printf("Global write lock taken on core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_pflock_write_lock(&sl_tab[rte_lcore_id()]);
+	printf("Hello from core %u !\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl_tab[rte_lcore_id()]);
+
+	rte_pflock_read_lock(&sl);
+	printf("Global read lock taken on core %u\n", rte_lcore_id());
+	rte_delay_ms(100);
+	printf("Release global read lock on core %u\n", rte_lcore_id());
+	rte_pflock_read_unlock(&sl);
+
+	return 0;
+}
+
+static rte_pflock_t lk = RTE_PFLOCK_INITIALIZER;
+static volatile uint64_t pflock_data;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+#define TEST_PFLOCK_DEBUG 0
+
+static int
+load_loop_fn(__rte_unused void *arg)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	/* wait synchro for workers */
+	if (lcore != rte_get_main_lcore())
+		rte_wait_until_equal_32(&synchro, 1, __ATOMIC_RELAXED);
+
+	begin = rte_rdtsc_precise();
+	while (lcount < MAX_LOOP) {
+		rte_pflock_write_lock(&lk);
+		++pflock_data;
+		rte_pflock_write_unlock(&lk);
+
+		rte_pflock_read_lock(&lk);
+		if (TEST_PFLOCK_DEBUG && !(lcount % 100))
+			printf("Core [%u] pflock_data = %"PRIu64"\n",
+				lcore, pflock_data);
+		rte_pflock_read_unlock(&lk);
+
+		lcount++;
+		/* delay to make lock duty cycle slightly realistic */
+		rte_pause();
+	}
+
+	time_diff = rte_rdtsc_precise() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_pflock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+
+	printf("\nPhase fair test on %u cores...\n", rte_lcore_count());
+
+	/* clear synchro and start workers */
+	synchro = 0;
+	if (rte_eal_mp_remote_launch(load_loop_fn, NULL, SKIP_MAIN) < 0)
+		return -1;
+
+	/* start synchro and launch test on main */
+	__atomic_store_n(&synchro, 1, __ATOMIC_RELAXED);
+	load_loop_fn(NULL);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] cost time = %"PRIu64" us\n",
+			i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total cost time = %"PRIu64" us\n", total);
+	memset(time_count, 0, sizeof(time_count));
+
+	return 0;
+}
+
+/*
+ * - There is a global pflock and a table of pflocks (one per lcore).
+ *
+ * - The test function takes all of these locks and launches the
+ *   ``test_pflock_per_core()`` function on each core (except the main).
+ *
+ *   - The function takes the global write lock, display something,
+ *     then releases the global lock.
+ *   - Then, it takes the per-lcore write lock, display something, and
+ *     releases the per-core lock.
+ *   - Finally, a read lock is taken during 100 ms, then released.
+ *
+ * - The main function unlocks the per-lcore locks sequentially and
+ *   waits between each lock. This triggers the display of a message
+ *   for each core, in the correct order.
+ *
+ *   Then, it tries to take the global write lock and display the last
+ *   message. The autotest script checks that the message order is correct.
+ */
+static int
+pflock_test1(void)
+{
+	int i;
+
+	rte_pflock_init(&sl);
+	for (i = 0; i < RTE_MAX_LCORE; i++)
+		rte_pflock_init(&sl_tab[i]);
+
+	rte_pflock_write_lock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_lock(&sl_tab[i]);
+		rte_eal_remote_launch(test_pflock_per_core, NULL, i);
+	}
+
+	rte_pflock_write_unlock(&sl);
+
+	RTE_LCORE_FOREACH_WORKER(i) {
+		rte_pflock_write_unlock(&sl_tab[i]);
+		rte_delay_ms(100);
+	}
+
+	rte_pflock_write_lock(&sl);
+	/* this message should be the last message of test */
+	printf("Global write lock taken on main core %u\n", rte_lcore_id());
+	rte_pflock_write_unlock(&sl);
+
+	rte_eal_mp_wait_lcore();
+
+	if (test_pflock_perf() < 0)
+		return -1;
+
+	return 0;
+}
+
+static int
+test_pflock(void)
+{
+	uint32_t i;
+	int32_t rc, ret;
+
+	static const struct {
+		const char *name;
+		int (*ftst)(void);
+	} test[] = {
+		{
+			.name = "pflock_test1",
+			.ftst = pflock_test1,
+		},
+	};
+
+	ret = 0;
+	for (i = 0; i != RTE_DIM(test); i++) {
+		printf("starting test %s;\n", test[i].name);
+		rc = test[i].ftst();
+		printf("test %s completed with status %d\n", test[i].name, rc);
+		ret |= rc;
+	}
+
+	return ret;
+}
+
+static int
+try_read(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i;
+
+	rc = rte_pflock_read_trylock(&try_pflock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	for (i = 0; i != RTE_DIM(try_pflock_data.data.u64); i++) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_pflock_data.data.u64[i] != 0) {
+			printf("%s(%u) error: unexpected data pattern\n",
+				__func__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_pflock_data.data,
+				sizeof(try_pflock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+	}
+
+	rte_pflock_read_unlock(&try_pflock_data.lock);
+	return rc;
+}
+
+static int
+try_write(uint32_t lc)
+{
+	int32_t rc;
+	uint32_t i, v;
+
+	v = RTE_MAX(lc % UINT8_MAX, 1U);
+
+	rc = rte_pflock_write_trylock(&try_pflock_data.lock);
+	if (rc != 0)
+		return rc;
+
+	/* update by bytes in reverese order */
+	for (i = RTE_DIM(try_pflock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_pflock_data.data.u8[i] != 0) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_pflock_data.data,
+				sizeof(try_pflock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_pflock_data.data.u8[i] = v;
+	}
+
+	/* restore by bytes in reverese order */
+	for (i = RTE_DIM(try_pflock_data.data.u8); i-- != 0; ) {
+
+		/* race condition occurred, lock doesn't work properly */
+		if (try_pflock_data.data.u8[i] != v) {
+			printf("%s:%d(%u) error: unexpected data pattern\n",
+				__func__, __LINE__, lc);
+			rte_memdump(stdout, NULL,
+				(void *)(uintptr_t)&try_pflock_data.data,
+				sizeof(try_pflock_data.data));
+			rc = -EFAULT;
+			break;
+		}
+
+		try_pflock_data.data.u8[i] = 0;
+	}
+
+	rte_pflock_write_unlock(&try_pflock_data.lock);
+	return rc;
+}
+
+static int
+try_read_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_pflock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_RDLOCK;
+
+	ftm = try_pflock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_read(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static int
+try_write_lcore(__rte_unused void *data)
+{
+	int32_t rc;
+	uint32_t i, lc;
+	uint64_t ftm, stm, tm;
+	struct try_pflock_lcore *lcd;
+
+	lc = rte_lcore_id();
+	lcd = try_lcore_data + lc;
+	lcd->type = LC_TYPE_WRLOCK;
+
+	ftm = try_pflock_data.tick;
+	stm = rte_get_timer_cycles();
+
+	do {
+		for (i = 0; i != ITER_NUM; i++) {
+			rc = try_write(lc);
+			if (rc == 0)
+				lcd->stat.success++;
+			else if (rc == -EBUSY)
+				lcd->stat.fail++;
+			else
+				break;
+			rc = 0;
+		}
+		tm = rte_get_timer_cycles() - stm;
+	} while (tm < ftm && rc == 0);
+
+	lcd->rc = rc;
+	lcd->stat.tick = tm;
+	return rc;
+}
+
+static void
+print_try_lcore_stats(const struct try_pflock_lcore *tlc, uint32_t lc)
+{
+	uint64_t f, s;
+
+	f = RTE_MAX(tlc->stat.fail, 1ULL);
+	s = RTE_MAX(tlc->stat.success, 1ULL);
+
+	printf("try_lcore_data[%u]={\n"
+		"\trc=%d,\n"
+		"\ttype=%s,\n"
+		"\tfail=%" PRIu64 ",\n"
+		"\tsuccess=%" PRIu64 ",\n"
+		"\tcycles=%" PRIu64 ",\n"
+		"\tcycles/op=%#Lf,\n"
+		"\tcycles/success=%#Lf,\n"
+		"\tsuccess/fail=%#Lf,\n"
+		"};\n",
+		lc,
+		tlc->rc,
+		tlc->type == LC_TYPE_RDLOCK ? "RDLOCK" : "WRLOCK",
+		tlc->stat.fail,
+		tlc->stat.success,
+		tlc->stat.tick,
+		(long double)tlc->stat.tick /
+		(tlc->stat.fail + tlc->stat.success),
+		(long double)tlc->stat.tick / s,
+		(long double)tlc->stat.success / f);
+}
+
+static void
+collect_try_lcore_stats(struct try_pflock_lcore *tlc,
+	const struct try_pflock_lcore *lc)
+{
+	tlc->stat.tick += lc->stat.tick;
+	tlc->stat.fail += lc->stat.fail;
+	tlc->stat.success += lc->stat.success;
+}
+
+/*
+ * Process collected results:
+ *  - check status
+ *  - collect and print statistics
+ */
+static int
+process_try_lcore_stats(void)
+{
+	int32_t rc;
+	uint32_t lc, rd, wr;
+	struct try_pflock_lcore rlc, wlc;
+
+	memset(&rlc, 0, sizeof(rlc));
+	memset(&wlc, 0, sizeof(wlc));
+
+	rlc.type = LC_TYPE_RDLOCK;
+	wlc.type = LC_TYPE_WRLOCK;
+	rd = 0;
+	wr = 0;
+
+	rc = 0;
+	RTE_LCORE_FOREACH(lc) {
+		rc |= try_lcore_data[lc].rc;
+		if (try_lcore_data[lc].type == LC_TYPE_RDLOCK) {
+			collect_try_lcore_stats(&rlc, try_lcore_data + lc);
+			rd++;
+		} else {
+			collect_try_lcore_stats(&wlc, try_lcore_data + lc);
+			wr++;
+		}
+	}
+
+	if (rc == 0) {
+		RTE_LCORE_FOREACH(lc)
+			print_try_lcore_stats(try_lcore_data + lc, lc);
+
+		if (rd != 0) {
+			printf("aggregated stats for %u RDLOCK cores:\n", rd);
+			print_try_lcore_stats(&rlc, rd);
+		}
+
+		if (wr != 0) {
+			printf("aggregated stats for %u WRLOCK cores:\n", wr);
+			print_try_lcore_stats(&wlc, wr);
+		}
+	}
+
+	return rc;
+}
+
+static void
+try_test_reset(void)
+{
+	memset(&try_lcore_data, 0, sizeof(try_lcore_data));
+	memset(&try_pflock_data, 0, sizeof(try_pflock_data));
+	try_pflock_data.tick = TEST_SEC * rte_get_tsc_hz();
+}
+
+/* all lcores grab RDLOCK */
+static int
+try_pflock_test_rda(void)
+{
+	try_test_reset();
+
+	/* start read test on all avaialble lcores */
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, CALL_MAIN);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* all worker lcores grab RDLOCK, main one grabs WRLOCK */
+static int
+try_pflock_test_rds_wrm(void)
+{
+	try_test_reset();
+
+	rte_eal_mp_remote_launch(try_read_lcore, NULL, SKIP_MAIN);
+	try_write_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+/* main and even worker lcores grab RDLOCK, odd lcores grab WRLOCK */
+static int
+try_pflock_test_rde_wro(void)
+{
+	uint32_t lc, mlc;
+
+	try_test_reset();
+
+	mlc = rte_get_main_lcore();
+
+	RTE_LCORE_FOREACH(lc) {
+		if (lc != mlc) {
+			if ((lc & 1) == 0)
+				rte_eal_remote_launch(try_read_lcore,
+						NULL, lc);
+			else
+				rte_eal_remote_launch(try_write_lcore,
+						NULL, lc);
+		}
+	}
+	try_read_lcore(NULL);
+	rte_eal_mp_wait_lcore();
+
+	return process_try_lcore_stats();
+}
+
+REGISTER_TEST_COMMAND(pflock_autotest, test_pflock);
+
+/* subtests used in meson for CI */
+REGISTER_TEST_COMMAND(pflock_test1_autotest, pflock_test1);
+REGISTER_TEST_COMMAND(pflock_rda_autotest, try_pflock_test_rda);
+REGISTER_TEST_COMMAND(pflock_rds_wrm_autotest, try_pflock_test_rds_wrm);
+REGISTER_TEST_COMMAND(pflock_rde_wro_autotest, try_pflock_test_rde_wro);
diff --git a/lib/librte_eal/arm/include/meson.build b/lib/librte_eal/arm/include/meson.build
index 770766de1a34..2c3cff61bed6 100644
--- a/lib/librte_eal/arm/include/meson.build
+++ b/lib/librte_eal/arm/include/meson.build
@@ -21,6 +21,7 @@  arch_headers = files(
 	'rte_pause_32.h',
 	'rte_pause_64.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch_32.h',
 	'rte_prefetch_64.h',
diff --git a/lib/librte_eal/arm/include/rte_pflock.h b/lib/librte_eal/arm/include/rte_pflock.h
new file mode 100644
index 000000000000..bb9934eec469
--- /dev/null
+++ b/lib/librte_eal/arm/include/rte_pflock.h
@@ -0,0 +1,18 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_ARM_H_
+#define _RTE_PFLOCK_ARM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_ARM_H_ */
diff --git a/lib/librte_eal/include/generic/rte_pflock.h b/lib/librte_eal/include/generic/rte_pflock.h
new file mode 100644
index 000000000000..6808c70c34a2
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_pflock.h
@@ -0,0 +1,273 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corp.
+ * Copyright 2011-2015 Samy Al Bahra.
+ * All rights reserved.
+ */
+
+#ifndef _RTE_PFLOCK_H_
+#define _RTE_PFLOCK_H_
+
+/**
+ * @file
+ *
+ * Phase-fair locks
+ *
+ * This file defines an API for Phase Fair reader writer locks,
+ * which is a variant of typical reader-writer locks that prevent
+ * starvation. In this type of lock, readers and writers alternate.
+ * This significantly reduces the worst-case blocking for readers and writers.
+ *
+ * This is an implementation derived from FreeBSD
+ * based on the work described in:
+ *    Brandenburg, B. and Anderson, J. 2010. Spin-Based
+ *    Reader-Writer Synchronization for Multiprocessor Real-Time Systems
+ *
+ * All locks must be initialised before use, and only initialised once.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_pause.h>
+
+/**
+ * The rte_pflock_t type.
+ */
+struct rte_pflock {
+	union rte_pflock_ticket {
+		uint32_t tickets;
+		struct {
+			uint16_t in;
+			uint16_t out;
+		};
+	} rd, wr;
+};
+typedef struct rte_pflock rte_pflock_t;
+
+/**
+ * Constants used to map the bits in reader counter
+ *
+ * +-----------------+-+-+
+ * |     Readers     |W|P|
+ * |                 |R|H|
+ * +-----------------+-+-+
+ */
+
+#define RTE_PFLOCK_LSB   0xFFF0
+#define RTE_PFLOCK_RINC  0x100		/* Reader increment value. */
+#define RTE_PFLOCK_WBITS 0x3		/* Writer bits in reader. */
+#define RTE_PFLOCK_PRES  0x2		/* Writer present bit. */
+#define RTE_PFLOCK_PHID  0x1		/* Phase ID bit. */
+
+/**
+ * A static pflock initializer.
+ */
+#define RTE_PFLOCK_INITIALIZER {  }
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Initialize the pflock to an unlocked state.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_init(struct rte_pflock *pf)
+{
+	pf->rd.tickets = 0;
+	pf->wr.tickets = 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take a pflock for read.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_lock(rte_pflock_t *pf)
+{
+	uint32_t w;
+
+	/*
+	 * If no writer is present, then the operation has completed
+	 * successfully.
+	 */
+	w = __atomic_fetch_add(&pf->rd.in, RTE_PFLOCK_RINC, __ATOMIC_ACQ_REL) & RTE_PFLOCK_WBITS;
+	if (w == 0)
+		return;
+
+	/* Wait for current write phase to complete. */
+	while ((__atomic_load_n(&pf->rd.in, __ATOMIC_ACQUIRE) & RTE_PFLOCK_WBITS) == w)
+		rte_pause();
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock locked for reading.
+ *
+ * @param pf
+ *   A pointer to the pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_read_unlock(rte_pflock_t *pf)
+{
+	__atomic_fetch_add(&pf->rd.out, RTE_PFLOCK_RINC, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take a pflock for reading
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for reading because a
+ *     writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_read_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->rd.tickets, __ATOMIC_RELAXED);
+
+	/* loop until writer shows up */
+	while ((old.in & RTE_PFLOCK_WBITS) == 0) {
+		new.out = old.out;
+		new.in = old.in + RTE_PFLOCK_RINC;
+		if (__atomic_compare_exchange_n(&pf->rd.tickets, &old.tickets, new.tickets,
+						0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+			return 0;	/* got it */
+
+		/* either new reader got in (so retry) or a writer */
+	}
+
+	/* If writer is present then we are busy */
+	return -EBUSY;
+}
+
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the ticketlock.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_lock(rte_pflock_t *pf)
+{
+	uint16_t ticket;
+
+	/* Acquire ownership of write-phase. */
+	ticket = __atomic_fetch_add(&pf->wr.in, 1, __ATOMIC_ACQUIRE);
+	rte_wait_until_equal_16(&pf->wr.out, ticket, __ATOMIC_RELAXED);
+
+	/*
+	 * Acquire ticket on read-side in order to allow them
+	 * to flush. Indicates to any incoming reader that a
+	 * write-phase is pending.
+	 *
+	 * Need ACQUIRE to prevent speculative execution of the wait loop
+	 */
+	ticket = __atomic_fetch_add(&pf->rd.in,
+				    (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				    __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Release a pflock held for writing.
+ *
+ * @param pf
+ *   A pointer to a pflock structure.
+ */
+__rte_experimental
+static inline void
+rte_pflock_write_unlock(rte_pflock_t *pf)
+{
+	/* Migrate from write phase to read phase. */
+	__atomic_fetch_and(&pf->rd.in, RTE_PFLOCK_LSB, __ATOMIC_RELEASE);
+
+	/* Allow other writers to continue. */
+	__atomic_fetch_add(&pf->wr.out, 1, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Try to take the pflock for write.
+ *
+ * @param pf
+ *   A pointer to the pflock.
+ * @return
+ *   - zero if the lock is successfully taken
+ *   - -EBUSY if lock could not be acquired for writing because
+ *     another writer holds the lock
+ */
+__rte_experimental
+static inline int
+rte_pflock_write_trylock(rte_pflock_t *pf)
+{
+	union rte_pflock_ticket old, new;
+	uint16_t ticket;
+
+	/* Get current state of the lock */
+	old.tickets = __atomic_load_n(&pf->wr.tickets, __ATOMIC_RELAXED);
+	new.out = old.out;
+	new.in  = old.in + 1;
+	ticket = new.in;
+
+	/* if writer is already present then too busy */
+	if (old.out != new.in ||
+	    !__atomic_compare_exchange_n(&pf->wr.tickets, &old.tickets, new.tickets,
+					 0, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
+		return -EBUSY; /* another writer is present already */
+
+	/*
+	 * We now own the write phase, but still need to tell
+	 * readers and wait for them.
+	 *
+	 * Need ACQUIRE semantics to avoid speculative execution of wait loop
+	 */
+	ticket  = __atomic_fetch_add(&pf->rd.in,
+				 (ticket & RTE_PFLOCK_PHID) | RTE_PFLOCK_PRES,
+				 __ATOMIC_ACQUIRE);
+
+	/* Wait for any pending readers to flush. */
+	rte_wait_until_equal_16(&pf->rd.out, ticket, __ATOMIC_RELAXED);
+	return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_PFLOCK_H */
diff --git a/lib/librte_eal/ppc/include/meson.build b/lib/librte_eal/ppc/include/meson.build
index dae40ede546e..7692a531ccba 100644
--- a/lib/librte_eal/ppc/include/meson.build
+++ b/lib/librte_eal/ppc/include/meson.build
@@ -11,6 +11,7 @@  arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rwlock.h',
diff --git a/lib/librte_eal/ppc/include/rte_pflock.h b/lib/librte_eal/ppc/include/rte_pflock.h
new file mode 100644
index 000000000000..e7b875ac56a8
--- /dev/null
+++ b/lib/librte_eal/ppc/include/rte_pflock.h
@@ -0,0 +1,16 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ */
+#ifndef _RTE_PFLOCK_PPC_64_H_
+#define _RTE_PFLOCK_PPC_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_PPC_64_H_ */
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index 1a6ad0b17342..f43645c20899 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -10,6 +10,7 @@  arch_headers = files(
 	'rte_mcslock.h',
 	'rte_memcpy.h',
 	'rte_pause.h',
+	'rte_pflock.h',
 	'rte_power_intrinsics.h',
 	'rte_prefetch.h',
 	'rte_rtm.h',
diff --git a/lib/librte_eal/x86/include/rte_pflock.h b/lib/librte_eal/x86/include/rte_pflock.h
new file mode 100644
index 000000000000..c2d876062c08
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_pflock.h
@@ -0,0 +1,18 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Microsoft Corporation
+ */
+
+#ifndef _RTE_PFLOCK_X86_64_H_
+#define _RTE_PFLOCK_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pflock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PFLOCK_X86_64_H_ */