net/af_packet: fix statistics

Message ID 20240430154129.7347-1-stephen@networkplumber.org (mailing list archive)
State New
Delegated to: Ferruh Yigit
Series: net/af_packet: fix statistics

Checks

Context Check Description
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/intel-Functional success Functional PASS
ci/github-robot: build success github build: passed
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-abi-testing success Testing PASS
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-sample-apps-testing success Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-unit-arm64-testing fail Testing issues
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-compile-arm64-testing success Testing PASS

Commit Message

Stephen Hemminger April 30, 2024, 3:39 p.m. UTC
The statistics in the af_packet driver do not follow the standard
practice of other drivers:

 - Statistics should be maintained as 64 bit even on 32-bit platforms.

 - Remove the tx_error counter since it was not correct.
   When transmit ring is full it is not an error and
   the driver correctly returns only the number sent.

 - Query kernel to find the number of packets missed.

 - Do not mark statistics as volatile.
   Instead, READ_ONCE() where necessary.

Also, the variable name igb_stats looks like a copy/paste leftover.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/af_packet/rte_eth_af_packet.c | 82 +++++++++++++----------
 1 file changed, 48 insertions(+), 34 deletions(-)
  

Comments

Ferruh Yigit May 1, 2024, 4:25 p.m. UTC | #1
On 4/30/2024 4:39 PM, Stephen Hemminger wrote:
> The statistics in the af_packet driver do not follow the standard
> practice of other drivers:
> 
>  - Statistics should be maintained as 64 bit even on 32-bit platforms.
> 

ack

>  - Remove the tx_error counter since it was not correct.
>    When transmit ring is full it is not an error and
>    the driver correctly returns only the number sent.
> 

nack
Transmit full is not the only return case here.
There are actual error paths that continue processing and rely on this
error calculation.
There are also error cases like the interface going down.
Those error cases should be handled individually if we remove this.
I suggest splitting this change into a separate patch.

>  - Query kernel to find the number of packets missed.
> 

ack.

>  - Do not mark statistics as volatile.
>    Instead, READ_ONCE() where necessary.
> 

I did something similar [1], and Mattias had some comments on it.
The issue is not on the reader (stats_get) side. Without volatile, the
writer (datapath thread) may end up *not* storing updated stats to memory.

On the reader side, I expect the value not to be in a register when the
function is called, so it ends up reading from memory, which doesn't
require volatile casting.

[1]
https://patchwork.dpdk.org/project/dpdk/patch/20240426143848.2280689-1-ferruh.yigit@amd.com/
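
To make the writer-side concern concrete, a minimal sketch (a
hypothetical forwarding loop, not the driver's code) of the
compiler-hoisting risk and of the "normal load + add + atomic store"
alternative discussed below:

#include <stdint.h>

struct q_stats {
	uint64_t rx_pkts;	/* note: no volatile */
};

/* Nothing in this loop tells the compiler that another thread reads
 * rx_pkts, so it may legally accumulate the counter in a register and
 * write it back only when the loop exits. */
void fwd_loop(struct q_stats *st, int (*rx_burst)(void))
{
	for (;;)
		st->rx_pkts += rx_burst();
}

/* Alternative: normal load + add, then a relaxed atomic store, which
 * forces a real store to memory on every update. */
void fwd_loop_atomic(struct q_stats *st, int (*rx_burst)(void))
{
	for (;;) {
		uint64_t v = st->rx_pkts + (uint64_t)rx_burst();
		__atomic_store_n(&st->rx_pkts, v, __ATOMIC_RELAXED);
	}
}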

Stephen Hemminger May 1, 2024, 4:42 p.m. UTC | #2
On Wed, 1 May 2024 17:25:59 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> On 4/30/2024 4:39 PM, Stephen Hemminger wrote:
> > The statistics in the af_packet driver do not follow the standard
> > practice of other drivers:
> > 
> >  - Statistics should be maintained as 64 bit even on 32-bit platforms.
> >   
> 
> ack
> 
> >  - Remove the tx_error counter since it was not correct.
> >    When transmit ring is full it is not an error and
> >    the driver correctly returns only the number sent.
> >   
> 
> nack
> Transmit full is not the only return case here.
> There are actual error paths that continue processing and rely on this
> error calculation.
> There are also error cases like the interface going down.
> Those error cases should be handled individually if we remove this.
> I suggest splitting this change into a separate patch.

It is possible to get errors, but the code wasn't looking for those.
See packet(7) man page.
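
For reference, a minimal sketch of the PACKET_STATISTICS interface that
packet(7) describes (standalone example; 'fd' is assumed to be an
already-open AF_PACKET socket):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Query the kernel's per-socket counters. Note that packet(7)
 * documents these counters as being reset to zero by each retrieval,
 * so a driver has to accumulate them across calls. */
static int print_packet_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) < 0) {
		perror("getsockopt(PACKET_STATISTICS)");
		return -1;
	}
	printf("packets: %u, drops: %u\n", st.tp_packets, st.tp_drops);
	return 0;
}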
  
Stephen Hemminger May 1, 2024, 4:43 p.m. UTC | #3
On Wed, 1 May 2024 17:25:59 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> >  - Do not mark statistics as volatile.
> >    Instead, READ_ONCE() where necessary.
> >   
> 
> I did something similar [1], and Mattias had some comments on it.
> The issue is not on the reader (stats_get) side. Without volatile, the
> writer (datapath thread) may end up *not* storing updated stats to memory.
> 
> On the reader side, I expect the value not to be in a register when the
> function is called, so it ends up reading from memory, which doesn't
> require volatile casting.

These are per-queue stats, and anybody foolish enough to have multiple
threads sharing the same queue without locking is playing with fire.
It is documented that DPDK PMDs don't support that.
  
Stephen Hemminger May 1, 2024, 4:44 p.m. UTC | #4
On Wed, 1 May 2024 17:25:59 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> >  - Remove the tx_error counter since it was not correct.
> >    When transmit ring is full it is not an error and
> >    the driver correctly returns only the number sent.
> >   
> 
> nack
> Transmit full is not the only return case here.
> There are actual error paths that continue processing and rely on this
> error calculation.
> There are also error cases like the interface going down.
> Those error cases should be handled individually if we remove this.
> I suggest splitting this change into a separate patch.

I see multiple drivers have copy/pasted the same code and treat
transmit full as an error. It is not.

There should be a new statistic at the ethdev layer that records
transmit full, applied across all drivers, but that would have
to wait for an ABI change.
  
Morten Brørup May 1, 2024, 6:18 p.m. UTC | #5
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Wednesday, 1 May 2024 18.45
> 
> On Wed, 1 May 2024 17:25:59 +0100
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
> > >  - Remove the tx_error counter since it was not correct.
> > >    When transmit ring is full it is not an error and
> > >    the driver correctly returns only the number sent.
> > >
> >
> > nack
> > Transmit full is not the only return case here.
> > There are actual error paths that continue processing and rely on this
> > error calculation.
> > There are also error cases like the interface going down.
> > Those error cases should be handled individually if we remove this.
> > I suggest splitting this change into a separate patch.
> 
> I see multiple drivers have copy/pasted the same code and treat
> transmit full as an error. It is not.

+1
Transmit full is certainly not an error!

> 
> There should be a new statistic at the ethdev layer that records
> transmit full, applied across all drivers, but that would have
> to wait for an ABI change.

What happens to these non-transmittable packets depends on the application.
Our application discards them and counts them in a (per-port, per-queue) application-level counter, tx_nodescr, which eventually becomes IF-MIB::ifOutDiscards in SNMP. I think many applications behave similarly, so having an ethdev-layer tx_nodescr counter might be helpful.
Other applications could try to retransmit them; if there are still no TX descriptors, they will be counted again.

In case anyone gets funny ideas: The PMD should still not free those non-transmitted packet mbufs, because the application might want to treat them differently than the transmitted packets, e.g. for latency stats or packet capture.
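
A sketch of that application-side pattern (tx_nodescr and the discard
policy are application choices here, not ethdev API):

#include <rte_ethdev.h>
#include <rte_mbuf.h>

/* Hypothetical per-queue, application-level counter. */
static uint64_t tx_nodescr;

/* Send a burst; whatever the PMD could not queue is discarded and
 * counted by the application (another application might retry instead). */
static void app_tx(uint16_t port, uint16_t queue,
		   struct rte_mbuf **pkts, uint16_t n)
{
	uint16_t sent = rte_eth_tx_burst(port, queue, pkts, n);

	/* Ring full is not an error: account for it at the app layer. */
	tx_nodescr += n - sent;
	while (sent < n)
		rte_pktmbuf_free(pkts[sent++]);
}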
  
Ferruh Yigit May 2, 2024, 1:47 p.m. UTC | #6
On 5/1/2024 7:18 PM, Morten Brørup wrote:
>> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
>> Sent: Wednesday, 1 May 2024 18.45
>>
>> On Wed, 1 May 2024 17:25:59 +0100
>> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
>>
>>>>  - Remove the tx_error counter since it was not correct.
>>>>    When transmit ring is full it is not an error and
>>>>    the driver correctly returns only the number sent.
>>>>
>>>
>>> nack
>>> Transmit full is not the only return case here.
>>> There are actual error paths that continue processing and rely on this
>>> error calculation.
>>> There are also error cases like the interface going down.
>>> Those error cases should be handled individually if we remove this.
>>> I suggest splitting this change into a separate patch.
>>
>> I see multiple drivers have copy/pasted the same code and treat
>> transmit full as an error. It is not.
> 
> +1
> Transmit full is certainly not an error!
> 

I am not referring to the transmit full case; there are error cases in
the driver:
- oversized packets
- VLAN insertion failure

In the above cases the Tx loop continues, relying on these packets being
counted as errors at the end of the loop. We can't just remove the error
counter; we need to handle the above cases.


- poll on fd fails
- poll on fd returns POLLERR (interface down)

In the above cases the driver's Tx loop breaks and all remaining packets
are counted as errors.


- sendto() fails

All packets already handed to af_packet frames are counted as errors.


As you can see, there are real error cases handled in the driver.
That is why, instead of just removing the error counter, I suggest
handling it more properly in a separate patch.
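
A sketch of what handling those cases individually could look like
(illustrative shape only, with made-up ring_has_room() and
max_frame_len; not the actual driver code):

#include <rte_mbuf.h>

/* Count genuine failures (e.g. oversized packets) explicitly, while a
 * full ring simply ends the burst without touching any error counter. */
static uint16_t
tx_burst_sketch(struct rte_mbuf **bufs, uint16_t nb_pkts,
		uint32_t max_frame_len, int (*ring_has_room)(void),
		uint64_t *oerrors)
{
	uint16_t i;

	for (i = 0; i < nb_pkts; i++) {
		struct rte_mbuf *mbuf = bufs[i];

		if (rte_pktmbuf_pkt_len(mbuf) > max_frame_len) {
			(*oerrors)++;		/* a real error */
			rte_pktmbuf_free(mbuf);
			continue;
		}
		if (!ring_has_room())
			break;			/* ring full: not an error */
		/* ... copy the packet into the next tpacket frame,
		 * update tx_pkts/tx_bytes ... */
	}
	return i;	/* consumed = queued + dropped-as-error */
}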

>>
>> There should be a new statistic at the ethdev layer that records
>> transmit full, applied across all drivers, but that would have
>> to wait for an ABI change.
> 
> What happens to these non-transmittable packets depends on the application.
> Our application discards them and counts them in a (per-port, per-queue) application-level counter, tx_nodescr, which eventually becomes IF-MIB::ifOutDiscards in SNMP. I think many applications behave similarly, so having an ethdev-layer tx_nodescr counter might be helpful.
> Other applications could try to retransmit them; if there are still no TX descriptors, they will be counted again.
> 
> In case anyone gets funny ideas: The PMD should still not free those non-transmitted packet mbufs, because the application might want to treat them differently than the transmitted packets, e.g. for latency stats or packet capture.
>
  
Ferruh Yigit May 2, 2024, 1:48 p.m. UTC | #7
On 5/1/2024 5:42 PM, Stephen Hemminger wrote:
> On Wed, 1 May 2024 17:25:59 +0100
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
>> On 4/30/2024 4:39 PM, Stephen Hemminger wrote:
>>> The statistics in the af_packet driver do not follow the standard
>>> practice of other drivers:
>>>
>>>  - Statistics should be maintained as 64 bit even on 32-bit platforms.
>>>   
>>
>> ack
>>
>>>  - Remove the tx_error counter since it was not correct.
>>>    When transmit ring is full it is not an error and
>>>    the driver correctly returns only the number sent.
>>>   
>>
>> nack
>> Transmit full is not the only return case here.
>> There are actual error paths that continue processing and rely on this
>> error calculation.
>> There are also error cases like the interface going down.
>> Those error cases should be handled individually if we remove this.
>> I suggest splitting this change into a separate patch.
> 
> It is possible to get errors, but the code wasn't looking for those.
> See packet(7) man page.
>

Some of the error cases are at the DPDK level, so we can't just rely on
the af_packet socket error counters.
  
Ferruh Yigit May 2, 2024, 2:12 p.m. UTC | #8
On 5/1/2024 5:43 PM, Stephen Hemminger wrote:
> On Wed, 1 May 2024 17:25:59 +0100
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
>>>  - Do not mark statistics as volatile.
>>>    Instead, READ_ONCE() where necessary.
>>>   
>>
>> I did something similar [1], and Mattias had some comments on it.
>> The issue is not on the reader (stats_get) side. Without volatile, the
>> writer (datapath thread) may end up *not* storing updated stats to memory.
>>
>> On the reader side, I expect the value not to be in a register when the
>> function is called, so it ends up reading from memory, which doesn't
>> require volatile casting.
> 
> These are per-queue stats, and anybody foolish enough to have multiple
> threads sharing the same queue without locking is playing with fire.
> It is documented that DPDK PMDs don't support that.
>

I am not referring to multiple cores sharing a queue; that is wrong for DPDK.

For the single core case, if a variable doesn't have the 'volatile'
qualifier and loads/stores are not atomic, the compiler is not aware that
other threads may access this value, so what prevents it from holding the
stats value in a register and storing it to memory at a later stage, say
at the end of the application's forwarding loop?

For this case keeping the 'volatile' qualifier works. But Mattias has a
suggestion to use "normal load + normal add + atomic store" in the datapath.


For getting stats, which happens in a different thread, you are already
casting to 'volatile' to force the compiler to read from memory each time.


Additionally, when we take stats reset into account, which can happen in a
different thread, 'volatile' is not enough.
For reliable stats reset, we either need full atomics, or the offset
logic I am trying in the other patch.
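
A minimal sketch of that offset idea (an assumed shape; the actual
patch referenced may differ):

#include <stdint.h>

/* The datapath thread owns 'value' and only ever grows it; reset just
 * records the value seen at reset time, so the writer is never raced. */
struct counter {
	uint64_t value;		/* written by the datapath thread only */
	uint64_t offset;	/* written by the control thread on reset */
};

/* Datapath: normal load + add + relaxed atomic store, forcing a real
 * store on every update without a full atomic read-modify-write. */
static inline void counter_add(struct counter *c, uint64_t n)
{
	__atomic_store_n(&c->value, c->value + n, __ATOMIC_RELAXED);
}

/* Control plane: read and reset without writing the datapath's field. */
static inline uint64_t counter_read(const struct counter *c)
{
	return __atomic_load_n(&c->value, __ATOMIC_RELAXED) - c->offset;
}

static inline void counter_reset(struct counter *c)
{
	c->offset = __atomic_load_n(&c->value, __ATOMIC_RELAXED);
}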
  
Stephen Hemminger May 2, 2024, 4:16 p.m. UTC | #9
On Thu, 2 May 2024 15:12:43 +0100
Ferruh Yigit <ferruh.yigit@amd.com> wrote:

> I am not referring to multiple cores sharing a queue; that is wrong for DPDK.
> 
> For the single core case, if a variable doesn't have the 'volatile'
> qualifier and loads/stores are not atomic, the compiler is not aware that
> other threads may access this value, so what prevents it from holding the
> stats value in a register and storing it to memory at a later stage, say
> at the end of the application's forwarding loop?

No other driver does this; it is not a problem since once the rx/tx burst
function returns, there is no state the compiler is aware of.


> For this case keeping the 'volatile' qualifier works. But Mattias has a
> suggestion to use "normal load + normal add + atomic store" in the datapath.
> 
> 
> For getting stats, which happens in a different thread, you are already
> casting to 'volatile' to force the compiler to read from memory each time.
> 
> 
> Additionally, when we take stats reset into account, which can happen in a
> different thread, 'volatile' is not enough.
> For reliable stats reset, we either need full atomics, or the offset
> logic I am trying in the other patch.

If you want to attack the problem in the general case, there should be a
standard set of functions for handling per-core stats.  If you look at
the Linux kernel you will see macros around this function:

netdev_core_stats_inc(struct net_device *dev, u32 offset) {
	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
	unsigned long __percpu *field;
...
	field = (__force unsigned long __percpu *)((__force void *)p + offset);
	this_cpu_inc(*field);
}

The expansion of this_cpu_inc is nested but ends up being:
	*field += 1;

No volatile is needed.

The conclusion I reach from this is:
	- atomics are not needed for per-cpu data; see "volatile considered harmful".
          Using a form of compiler barrier is useful but should be hidden in generic code.
	- what is the state of the better per-lcore patches?
          That should be used here.
        - need a more generic percpu stats infrastructure
        - ethdev needs to have a generic percpu infra, not reinvented in xdp, tap, packet, etc.
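
As an illustration of that direction, a minimal per-lcore counter
sketch in DPDK style (the names are made up; no such generic API exists
today):

#include <stdint.h>
#include <rte_common.h>
#include <rte_lcore.h>

/* One cache-line-aligned slot per lcore: each datapath thread (assumed
 * to be an EAL lcore) increments only its own slot, so no atomics or
 * volatile are needed on the writer side. */
struct pkt_counters {
	uint64_t pkts;
	uint64_t bytes;
} __rte_cache_aligned;

static struct pkt_counters per_lcore_stats[RTE_MAX_LCORE];

static inline void stats_add(uint64_t pkts, uint64_t bytes)
{
	struct pkt_counters *s = &per_lcore_stats[rte_lcore_id()];

	s->pkts += pkts;
	s->bytes += bytes;
}

/* Reader sums all slots; slightly stale values are fine for stats. */
static inline void stats_sum(struct pkt_counters *out)
{
	unsigned int i;

	out->pkts = 0;
	out->bytes = 0;
	for (i = 0; i < RTE_MAX_LCORE; i++) {
		out->pkts += per_lcore_stats[i].pkts;
		out->bytes += per_lcore_stats[i].bytes;
	}
}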
  
Mattias Rönnblom May 2, 2024, 5:57 p.m. UTC | #10
On 2024-05-02 18:16, Stephen Hemminger wrote:
> On Thu, 2 May 2024 15:12:43 +0100
> Ferruh Yigit <ferruh.yigit@amd.com> wrote:
> 
>> I am not referring to multiple cores sharing a queue; that is wrong for DPDK.
>>
>> For the single core case, if a variable doesn't have the 'volatile'
>> qualifier and loads/stores are not atomic, the compiler is not aware that
>> other threads may access this value, so what prevents it from holding the
>> stats value in a register and storing it to memory at a later stage, say
>> at the end of the application's forwarding loop?
> 
> No other driver does this; it is not a problem since once the rx/tx burst
> function returns, there is no state the compiler is aware of.
> 
> 
>> For this case keeping the 'volatile' qualifier works. But Mattias has a
>> suggestion to use "normal load + normal add + atomic store" in the datapath.
>>
>>
>> For getting stats, which happens in a different thread, you are already
>> casting to 'volatile' to force the compiler to read from memory each time.
>>
>>
>> Additionally, when we take stats reset into account, which can happen in a
>> different thread, 'volatile' is not enough.
>> For reliable stats reset, we either need full atomics, or the offset
>> logic I am trying in the other patch.
> 
> If you want to attack the problem in the general case, there should be a
> standard set of functions for handling per-core stats.  If you look at
> the Linux kernel you will see macros around this function:
> 
> netdev_core_stats_inc(struct net_device *dev, u32 offset) {
> 	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
> 	unsigned long __percpu *field;
> ...
> 	field = (__force unsigned long __percpu *)((__force void *)p + offset);
> 	this_cpu_inc(*field);
> }
> 
> The expansion of this_cpu_inc is nested but ends up being:
> 	*field += 1;
> 
> No volatile is needed.
> 
> The conclusion I reach from this is:
> 	- atomics are not needed for per-cpu data; see "volatile considered harmful".
>            Using a form of compiler barrier is useful but should be hidden in generic code.
> 	- what is the state of the better per-lcore patches?
>            That should be used here.
>          - need a more generic percpu stats infrastructure
>          - ethdev needs to have a generic percpu infra, not reinvented in xdp, tap, packet, etc.
> 
> 

In this case, the counters are already effectively "per-lcore", since a 
single RX/TX queue pair is not MT safe, and thus is only used by one 
thread (or at least, by one thread at a time).

My impression is that both the Linux kernel and DPDK tend to assume that
no load or store tearing occurs for accesses to small, naturally
aligned integer types. I'm guessing it's also commonly assumed there
are limits to how large a section of the whole program the compiler may
reason about when eliminating data and code/instructions that have no
effect on that thread. Thus, volatile is not needed to assure that, for
example, counter state is actually maintained and updates actually occur.

I will continue work on the lcore variables patch set as soon as time 
permits. That will be after my attempts to complete the bitops and the 
bitset patch sets have concluded.
  

Patch

diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 397a32db58..0313aee482 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -39,6 +39,10 @@ 
 #define DFLT_FRAME_SIZE		(1 << 11)
 #define DFLT_FRAME_COUNT	(1 << 9)
 
+#ifndef READ_ONCE
+#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var))))
+#endif
+
 struct pkt_rx_queue {
 	int sockfd;
 
@@ -51,8 +55,8 @@  struct pkt_rx_queue {
 	uint16_t in_port;
 	uint8_t vlan_strip;
 
-	volatile unsigned long rx_pkts;
-	volatile unsigned long rx_bytes;
+	uint64_t rx_pkts;
+	uint64_t rx_bytes;
 };
 
 struct pkt_tx_queue {
@@ -64,9 +68,8 @@  struct pkt_tx_queue {
 	unsigned int framecount;
 	unsigned int framenum;
 
-	volatile unsigned long tx_pkts;
-	volatile unsigned long err_pkts;
-	volatile unsigned long tx_bytes;
+	uint64_t tx_pkts;
+	uint64_t tx_bytes;
 };
 
 struct pmd_internals {
@@ -305,7 +308,6 @@  eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
 	pkt_q->framenum = framenum;
 	pkt_q->tx_pkts += num_tx;
-	pkt_q->err_pkts += i - num_tx;
 	pkt_q->tx_bytes += num_tx_bytes;
 	return i;
 }
@@ -386,54 +388,66 @@  eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 }
 
 static int
-eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 {
-	unsigned i, imax;
-	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
-	unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
+	unsigned int i;
 	const struct pmd_internals *internal = dev->data->dev_private;
+	uint64_t bytes, packets;
+
+	for (i = 0; i < internal->nb_queues; i++) {
+		const struct pkt_rx_queue *rxq = &internal->rx_queue[i];
+		struct tpacket_stats pkt_stats;
+		socklen_t optlen = sizeof(pkt_stats);
+		int fd = rxq->sockfd;
+
+		bytes = READ_ONCE(rxq->rx_bytes);
+		packets = READ_ONCE(rxq->rx_pkts);
 
-	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
-	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
-	for (i = 0; i < imax; i++) {
-		igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
-		igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
-		rx_total += igb_stats->q_ipackets[i];
-		rx_bytes_total += igb_stats->q_ibytes[i];
+		stats->ipackets += packets;
+		stats->ibytes += bytes;
+
+		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+			stats->q_ipackets[i] = packets;
+			stats->q_ibytes[i] = bytes;
+		}
+
+		if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &pkt_stats, &optlen) < 0) {
+		PMD_LOG_ERRNO(ERR, "could not get PACKET_STATISTICS on AF_PACKET socket");
+			return -1;
+		}
+		stats->imissed += pkt_stats.tp_drops;
 	}
 
-	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
-	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
-	for (i = 0; i < imax; i++) {
-		igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
-		igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
-		tx_total += igb_stats->q_opackets[i];
-		tx_err_total += internal->tx_queue[i].err_pkts;
-		tx_bytes_total += igb_stats->q_obytes[i];
+	for (i = 0; i < internal->nb_queues; i++) {
+		const struct pkt_tx_queue *txq = &internal->tx_queue[i];
+
+		bytes = READ_ONCE(txq->tx_bytes);
+		packets = READ_ONCE(txq->tx_pkts);
+
+		stats->opackets += packets;
+		stats->obytes += bytes;
+
+		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+			stats->q_opackets[i] = packets;
+			stats->q_obytes[i] = bytes;
+		}
 	}
 
-	igb_stats->ipackets = rx_total;
-	igb_stats->ibytes = rx_bytes_total;
-	igb_stats->opackets = tx_total;
-	igb_stats->oerrors = tx_err_total;
-	igb_stats->obytes = tx_bytes_total;
+
 	return 0;
 }
 
 static int
 eth_stats_reset(struct rte_eth_dev *dev)
 {
-	unsigned i;
+	unsigned int i;
 	struct pmd_internals *internal = dev->data->dev_private;
 
 	for (i = 0; i < internal->nb_queues; i++) {
 		internal->rx_queue[i].rx_pkts = 0;
 		internal->rx_queue[i].rx_bytes = 0;
-	}
 
-	for (i = 0; i < internal->nb_queues; i++) {
 		internal->tx_queue[i].tx_pkts = 0;
-		internal->tx_queue[i].err_pkts = 0;
 		internal->tx_queue[i].tx_bytes = 0;
 	}