net/pcap: reduce time for stopping device

Message ID 20220825072041.10768-1-yidingx.zhou@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers
Series net/pcap: reduce time for stopping device |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-x86_64-unit-testing fail Testing issues
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/github-robot: build success github build: passed
ci/intel-Testing success Testing PASS

Commit Message

Yiding Zhou Aug. 25, 2022, 7:20 a.m. UTC
  The pcap file will be synchronized to the disk when stopping the device.
It takes a long time if the file is large that would cause the
'detach sync request' timeout when the device is closed under multi-process
scenario.

This commit fixes the issue by performing synchronization in Tx path

Fixes: 4c173302c307 ("pcap: add new driver")
Cc: stable@dpdk.org

Signed-off-by: Yiding Zhou <yidingx.zhou@intel.com>
---
 drivers/net/pcap/pcap_ethdev.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)
  

Comments

Ferruh Yigit Aug. 25, 2022, 10:09 a.m. UTC | #1
On 8/25/2022 8:20 AM, Yiding Zhou wrote:
> The pcap file will be synchronized to the disk when stopping the device.
> It takes a long time if the file is large that would cause the
> 'detach sync request' timeout when the device is closed under multi-process
> scenario.
> 
> This commit fixes the issue by performing synchronization in Tx path
> 
> Fixes: 4c173302c307 ("pcap: add new driver")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Yiding Zhou <yidingx.zhou@intel.com>
> ---
>   drivers/net/pcap/pcap_ethdev.c | 18 ++++++++++++++++--
>   1 file changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/pcap/pcap_ethdev.c b/drivers/net/pcap/pcap_ethdev.c
> index ec29fd6bc5..52eafa5674 100644
> --- a/drivers/net/pcap/pcap_ethdev.c
> +++ b/drivers/net/pcap/pcap_ethdev.c
> @@ -3,7 +3,7 @@
>    * Copyright(c) 2014 6WIND S.A.
>    * All rights reserved.
>    */
> -
> +#include <unistd.h>
>   #include <time.h>
>   
>   #include <pcap.h>
> @@ -38,6 +38,8 @@
>   
>   #define RTE_PMD_PCAP_MAX_QUEUES 16
>   
> +#define ETH_PCAP_SYNC_THRESHOLD 0x20000000
> +
>   static char errbuf[PCAP_ERRBUF_SIZE];
>   static struct timespec start_time;
>   static uint64_t start_cycles;
> @@ -47,6 +49,8 @@ static uint8_t iface_idx;
>   static uint64_t timestamp_rx_dynflag;
>   static int timestamp_dynfield_offset = -1;
>   
> +RTE_DEFINE_PER_LCORE(uint64_t, _pcap_cached_bytes);
> +
>   struct queue_stat {
>   	volatile unsigned long pkts;
>   	volatile unsigned long bytes;
> @@ -144,6 +148,16 @@ static struct rte_eth_link pmd_link = {
>   
>   RTE_LOG_REGISTER_DEFAULT(eth_pcap_logtype, NOTICE);
>   
> +static inline void
> +pcap_dumper_data_sync(pcap_dumper_t *dumper, uint32_t bytes)
> +{
> +	RTE_PER_LCORE(_pcap_cached_bytes) += bytes;
> +	if (unlikely(RTE_PER_LCORE(_pcap_cached_bytes) > ETH_PCAP_SYNC_THRESHOLD)) {
> +		if (!fdatasync(fileno(pcap_dump_file(dumper))))
> +			RTE_PER_LCORE(_pcap_cached_bytes) = 0;
> +	}
> +}
> +
>   static struct queue_missed_stat*
>   queue_missed_stat_update(struct rte_eth_dev *dev, unsigned int qid)
>   {
> @@ -421,7 +435,7 @@ eth_pcap_tx_dumper(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
>   	 * process stops and to make sure the pcap file is actually written,
>   	 * we flush the pcap dumper within each burst.
>   	 */
> -	pcap_dump_flush(dumper);
> +	pcap_dumper_data_sync(dumper, tx_bytes);

'pcap_dump_flush()' should be doing the same thing, to write buffer to 
file, isn't it working?

Can you check the return value of the 'pcap_dump_flush()' API, I wonder 
if it keeps failing, for some reason?

>   	dumper_q->tx_stat.pkts += num_tx;
>   	dumper_q->tx_stat.bytes += tx_bytes;
>   	dumper_q->tx_stat.err_pkts += nb_pkts - num_tx;
  
Yiding Zhou Aug. 25, 2022, 11:17 a.m. UTC | #2
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
> Sent: Thursday, August 25, 2022 6:09 PM
> To: Zhou, YidingX <yidingx.zhou@intel.com>; dev@dpdk.org
> Cc: Zhang, Qi Z <qi.z.zhang@intel.com>; stable@dpdk.org
> Subject: Re: [PATCH] net/pcap: reduce time for stopping device
> 
> On 8/25/2022 8:20 AM, Yiding Zhou wrote:
> > The pcap file will be synchronized to the disk when stopping the device.
> > It takes a long time if the file is large that would cause the 'detach
> > sync request' timeout when the device is closed under multi-process
> > scenario.
> >
> > This commit fixes the issue by performing synchronization in Tx path
> >
> > Fixes: 4c173302c307 ("pcap: add new driver")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Yiding Zhou <yidingx.zhou@intel.com>
> > ---
> >   drivers/net/pcap/pcap_ethdev.c | 18 ++++++++++++++++--
> >   1 file changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/pcap/pcap_ethdev.c
> > b/drivers/net/pcap/pcap_ethdev.c index ec29fd6bc5..52eafa5674 100644
> > --- a/drivers/net/pcap/pcap_ethdev.c
> > +++ b/drivers/net/pcap/pcap_ethdev.c
> > @@ -3,7 +3,7 @@
> >    * Copyright(c) 2014 6WIND S.A.
> >    * All rights reserved.
> >    */
> > -
> > +#include <unistd.h>
> >   #include <time.h>
> >
> >   #include <pcap.h>
> > @@ -38,6 +38,8 @@
> >
> >   #define RTE_PMD_PCAP_MAX_QUEUES 16
> >
> > +#define ETH_PCAP_SYNC_THRESHOLD 0x20000000
> > +
> >   static char errbuf[PCAP_ERRBUF_SIZE];
> >   static struct timespec start_time;
> >   static uint64_t start_cycles;
> > @@ -47,6 +49,8 @@ static uint8_t iface_idx;
> >   static uint64_t timestamp_rx_dynflag;
> >   static int timestamp_dynfield_offset = -1;
> >
> > +RTE_DEFINE_PER_LCORE(uint64_t, _pcap_cached_bytes);
> > +
> >   struct queue_stat {
> >   	volatile unsigned long pkts;
> >   	volatile unsigned long bytes;
> > @@ -144,6 +148,16 @@ static struct rte_eth_link pmd_link = {
> >
> >   RTE_LOG_REGISTER_DEFAULT(eth_pcap_logtype, NOTICE);
> >
> > +static inline void
> > +pcap_dumper_data_sync(pcap_dumper_t *dumper, uint32_t bytes) {
> > +	RTE_PER_LCORE(_pcap_cached_bytes) += bytes;
> > +	if (unlikely(RTE_PER_LCORE(_pcap_cached_bytes) >
> ETH_PCAP_SYNC_THRESHOLD)) {
> > +		if (!fdatasync(fileno(pcap_dump_file(dumper))))
> > +			RTE_PER_LCORE(_pcap_cached_bytes) = 0;
> > +	}
> > +}
> > +
> >   static struct queue_missed_stat*
> >   queue_missed_stat_update(struct rte_eth_dev *dev, unsigned int qid)
> >   {
> > @@ -421,7 +435,7 @@ eth_pcap_tx_dumper(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
> >   	 * process stops and to make sure the pcap file is actually written,
> >   	 * we flush the pcap dumper within each burst.
> >   	 */
> > -	pcap_dump_flush(dumper);
> > +	pcap_dumper_data_sync(dumper, tx_bytes);
> 
> 'pcap_dump_flush()' should be doing the same thing, to write buffer to file,
> isn't it working?
> 
> Can you check the return value of the 'pcap_dump_flush()' API, I wonder if it
> keeps failing, for some reason?
> 

'pcap_dump_flush()' returns 0 each time without error, it calls 'fflush()' to flush userspace buffers to kernel buffers, not disk. 'fdatasync()' to ensure data is written to disk.

> >   	dumper_q->tx_stat.pkts += num_tx;
> >   	dumper_q->tx_stat.bytes += tx_bytes;
> >   	dumper_q->tx_stat.err_pkts += nb_pkts - num_tx;
  
Ferruh Yigit Aug. 25, 2022, 12:21 p.m. UTC | #3
On 8/25/2022 12:17 PM, Zhou, YidingX wrote:
> 
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
>> Sent: Thursday, August 25, 2022 6:09 PM
>> To: Zhou, YidingX <yidingx.zhou@intel.com>; dev@dpdk.org
>> Cc: Zhang, Qi Z <qi.z.zhang@intel.com>; stable@dpdk.org
>> Subject: Re: [PATCH] net/pcap: reduce time for stopping device
>>
>> On 8/25/2022 8:20 AM, Yiding Zhou wrote:
>>> The pcap file will be synchronized to the disk when stopping the device.
>>> It takes a long time if the file is large that would cause the 'detach
>>> sync request' timeout when the device is closed under multi-process
>>> scenario.
>>>
>>> This commit fixes the issue by performing synchronization in Tx path
>>>
>>> Fixes: 4c173302c307 ("pcap: add new driver")
>>> Cc: stable@dpdk.org
>>>
>>> Signed-off-by: Yiding Zhou <yidingx.zhou@intel.com>
>>> ---
>>>    drivers/net/pcap/pcap_ethdev.c | 18 ++++++++++++++++--
>>>    1 file changed, 16 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/net/pcap/pcap_ethdev.c
>>> b/drivers/net/pcap/pcap_ethdev.c index ec29fd6bc5..52eafa5674 100644
>>> --- a/drivers/net/pcap/pcap_ethdev.c
>>> +++ b/drivers/net/pcap/pcap_ethdev.c
>>> @@ -3,7 +3,7 @@
>>>     * Copyright(c) 2014 6WIND S.A.
>>>     * All rights reserved.
>>>     */
>>> -
>>> +#include <unistd.h>
>>>    #include <time.h>
>>>
>>>    #include <pcap.h>
>>> @@ -38,6 +38,8 @@
>>>
>>>    #define RTE_PMD_PCAP_MAX_QUEUES 16
>>>
>>> +#define ETH_PCAP_SYNC_THRESHOLD 0x20000000
>>> +

I guess this is 512MB, can you please comment this.
Is there any specific reason to select this value, or is it arbitrary?


>>>    static char errbuf[PCAP_ERRBUF_SIZE];
>>>    static struct timespec start_time;
>>>    static uint64_t start_cycles;
>>> @@ -47,6 +49,8 @@ static uint8_t iface_idx;
>>>    static uint64_t timestamp_rx_dynflag;
>>>    static int timestamp_dynfield_offset = -1;
>>>
>>> +RTE_DEFINE_PER_LCORE(uint64_t, _pcap_cached_bytes);
>>> +
>>>    struct queue_stat {
>>>    	volatile unsigned long pkts;
>>>    	volatile unsigned long bytes;
>>> @@ -144,6 +148,16 @@ static struct rte_eth_link pmd_link = {
>>>
>>>    RTE_LOG_REGISTER_DEFAULT(eth_pcap_logtype, NOTICE);
>>>
>>> +static inline void
>>> +pcap_dumper_data_sync(pcap_dumper_t *dumper, uint32_t bytes) {

'pcap_' is the namespace for the libpcap, can you select another prefix, 
like 'eth_pcap_' as many driver functions does.

>>> +	RTE_PER_LCORE(_pcap_cached_bytes) += bytes;
>>> +	if (unlikely(RTE_PER_LCORE(_pcap_cached_bytes) >
>> ETH_PCAP_SYNC_THRESHOLD)) {
>>> +		if (!fdatasync(fileno(pcap_dump_file(dumper))))
>>> +			RTE_PER_LCORE(_pcap_cached_bytes) = 0;
>>> +	}
>>> +}
>>> +

pcap supports multiple queue, and each queue creates a new pcap dumper 
and single core/thread can be used for this multiple dumpers. In that 
case I think above per lcore variable logic doesn't work.

And instead of having a global value, what do you think to add a 
variable to 'struct pcap_tx_queue' for this purpose?

>>>    static struct queue_missed_stat*
>>>    queue_missed_stat_update(struct rte_eth_dev *dev, unsigned int qid)
>>>    {
>>> @@ -421,7 +435,7 @@ eth_pcap_tx_dumper(void *queue, struct rte_mbuf
>> **bufs, uint16_t nb_pkts)
>>>    	 * process stops and to make sure the pcap file is actually written,
>>>    	 * we flush the pcap dumper within each burst.
>>>    	 */
>>> -	pcap_dump_flush(dumper);
>>> +	pcap_dumper_data_sync(dumper, tx_bytes);
>>
>> 'pcap_dump_flush()' should be doing the same thing, to write buffer to file,
>> isn't it working?
>>
>> Can you check the return value of the 'pcap_dump_flush()' API, I wonder if it
>> keeps failing, for some reason?
>>
> 
> 'pcap_dump_flush()' returns 0 each time without error, it calls 'fflush()' to flush userspace buffers to kernel buffers, not disk. 'fdatasync()' to ensure data is written to disk.
> 

'pcap_dump_flush()' API documentation says "flushes the output buffer to 
the ``savefile,''", but as you said it uses 'fflush()' internally, so 
there is a chance that data is not written to the disk.

In this case, won't need to keep both, first flush and later 
fsync/fdatasync?

Do you observe any performance difference after change, since now 
writing to actual disk on datapath?

>>>    	dumper_q->tx_stat.pkts += num_tx;
>>>    	dumper_q->tx_stat.bytes += tx_bytes;
>>>    	dumper_q->tx_stat.err_pkts += nb_pkts - num_tx;
>
  
Yiding Zhou Aug. 29, 2022, 11:50 a.m. UTC | #4
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
> Sent: Thursday, August 25, 2022 8:22 PM
> To: Zhou, YidingX <yidingx.zhou@intel.com>; dev@dpdk.org
> Cc: Zhang, Qi Z <qi.z.zhang@intel.com>; stable@dpdk.org
> Subject: Re: [PATCH] net/pcap: reduce time for stopping device
> 
> On 8/25/2022 12:17 PM, Zhou, YidingX wrote:
> >
> >
> >> -----Original Message-----
> >> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
> >> Sent: Thursday, August 25, 2022 6:09 PM
> >> To: Zhou, YidingX <yidingx.zhou@intel.com>; dev@dpdk.org
> >> Cc: Zhang, Qi Z <qi.z.zhang@intel.com>; stable@dpdk.org
> >> Subject: Re: [PATCH] net/pcap: reduce time for stopping device
> >>
> >> On 8/25/2022 8:20 AM, Yiding Zhou wrote:
> >>> The pcap file will be synchronized to the disk when stopping the device.
> >>> It takes a long time if the file is large that would cause the
> >>> 'detach sync request' timeout when the device is closed under
> >>> multi-process scenario.
> >>>
> >>> This commit fixes the issue by performing synchronization in Tx path
> >>>
> >>> Fixes: 4c173302c307 ("pcap: add new driver")
> >>> Cc: stable@dpdk.org
> >>>
> >>> Signed-off-by: Yiding Zhou <yidingx.zhou@intel.com>
> >>> ---
> >>>    drivers/net/pcap/pcap_ethdev.c | 18 ++++++++++++++++--
> >>>    1 file changed, 16 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/drivers/net/pcap/pcap_ethdev.c
> >>> b/drivers/net/pcap/pcap_ethdev.c index ec29fd6bc5..52eafa5674 100644
> >>> --- a/drivers/net/pcap/pcap_ethdev.c
> >>> +++ b/drivers/net/pcap/pcap_ethdev.c
> >>> @@ -3,7 +3,7 @@
> >>>     * Copyright(c) 2014 6WIND S.A.
> >>>     * All rights reserved.
> >>>     */
> >>> -
> >>> +#include <unistd.h>
> >>>    #include <time.h>
> >>>
> >>>    #include <pcap.h>
> >>> @@ -38,6 +38,8 @@
> >>>
> >>>    #define RTE_PMD_PCAP_MAX_QUEUES 16
> >>>
> >>> +#define ETH_PCAP_SYNC_THRESHOLD 0x20000000
> >>> +
> 
> I guess this is 512MB, can you please comment this.
> Is there any specific reason to select this value, or is it arbitrary?
> 
> 

512M is arbitrary, because there is no API to get the disk cache size associated with a specific file.
I will test the performance impact of different values.

> >>>    static char errbuf[PCAP_ERRBUF_SIZE];
> >>>    static struct timespec start_time;
> >>>    static uint64_t start_cycles;
> >>> @@ -47,6 +49,8 @@ static uint8_t iface_idx;
> >>>    static uint64_t timestamp_rx_dynflag;
> >>>    static int timestamp_dynfield_offset = -1;
> >>>
> >>> +RTE_DEFINE_PER_LCORE(uint64_t, _pcap_cached_bytes);
> >>> +
> >>>    struct queue_stat {
> >>>    	volatile unsigned long pkts;
> >>>    	volatile unsigned long bytes;
> >>> @@ -144,6 +148,16 @@ static struct rte_eth_link pmd_link = {
> >>>
> >>>    RTE_LOG_REGISTER_DEFAULT(eth_pcap_logtype, NOTICE);
> >>>
> >>> +static inline void
> >>> +pcap_dumper_data_sync(pcap_dumper_t *dumper, uint32_t bytes) {
> 
> 'pcap_' is the namespace for the libpcap, can you select another prefix, like
> 'eth_pcap_' as many driver functions does.
> 
> >>> +	RTE_PER_LCORE(_pcap_cached_bytes) += bytes;
> >>> +	if (unlikely(RTE_PER_LCORE(_pcap_cached_bytes) >
> >> ETH_PCAP_SYNC_THRESHOLD)) {
> >>> +		if (!fdatasync(fileno(pcap_dump_file(dumper))))
> >>> +			RTE_PER_LCORE(_pcap_cached_bytes) = 0;
> >>> +	}
> >>> +}
> >>> +
> 
> pcap supports multiple queue, and each queue creates a new pcap dumper and
> single core/thread can be used for this multiple dumpers. In that case I think
> above per lcore variable logic doesn't work.
> 
> And instead of having a global value, what do you think to add a variable to
> 'struct pcap_tx_queue' for this purpose?
> 

Thanks for the comments, I will follow this.

> >>>    static struct queue_missed_stat*
> >>>    queue_missed_stat_update(struct rte_eth_dev *dev, unsigned int qid)
> >>>    {
> >>> @@ -421,7 +435,7 @@ eth_pcap_tx_dumper(void *queue, struct rte_mbuf
> >> **bufs, uint16_t nb_pkts)
> >>>    	 * process stops and to make sure the pcap file is actually written,
> >>>    	 * we flush the pcap dumper within each burst.
> >>>    	 */
> >>> -	pcap_dump_flush(dumper);
> >>> +	pcap_dumper_data_sync(dumper, tx_bytes);
> >>
> >> 'pcap_dump_flush()' should be doing the same thing, to write buffer
> >> to file, isn't it working?
> >>
> >> Can you check the return value of the 'pcap_dump_flush()' API, I
> >> wonder if it keeps failing, for some reason?
> >>
> >
> > 'pcap_dump_flush()' returns 0 each time without error, it calls 'fflush()' to
> flush userspace buffers to kernel buffers, not disk. 'fdatasync()' to ensure data
> is written to disk.
> >
> 
> 'pcap_dump_flush()' API documentation says "flushes the output buffer to the
> ``savefile,''", but as you said it uses 'fflush()' internally, so there is a chance that
> data is not written to the disk.
> 
> In this case, won't need to keep both, first flush and later fsync/fdatasync?
> 

I draw a diagram to describe it more clearly

 fwrite                             fclose/fflush                                                   fclose/fdatasync
--------->| libc buffer  |----------------> |    disk cache in RAM          |---------------------> |disk|
              | 4096 Bytes |                         | size is determined by OS |                                |       |

When the libc buffer is full, the system will automatically sync it to the disk cache.
It is easily full as it's only 4096 B size. so there is no need to call 'fflush()' every time.
The real time consuming action is syncing the disk cache to disk.
Because the disk cache is very large, it will take a long time to sync all at one time during 'fclose()',
so need to call 'fdatasync()' periodically to amortize the time.

> Do you observe any performance difference after change, since now writing to
> actual disk on datapath?
> 

I will verify any performance difference under these 2 scenarios.

> >>>    	dumper_q->tx_stat.pkts += num_tx;
> >>>    	dumper_q->tx_stat.bytes += tx_bytes;
> >>>    	dumper_q->tx_stat.err_pkts += nb_pkts - num_tx;
> >
  
Stephen Hemminger Aug. 31, 2022, 4:42 p.m. UTC | #5
On Mon, 29 Aug 2022 11:50:29 +0000
"Zhou, YidingX" <yidingx.zhou@intel.com> wrote:

> I draw a diagram to describe it more clearly
> 
>  fwrite                             fclose/fflush                                                   fclose/fdatasync
> --------->| libc buffer  |----------------> |    disk cache in RAM          |---------------------> |disk|  
>               | 4096 Bytes |                         | size is determined by OS |                                |       |
> 
> When the libc buffer is full, the system will automatically sync it to the disk cache.
> It is easily full as it's only 4096 B size. so there is no need to call 'fflush()' every time.
> The real time consuming action is syncing the disk cache to disk.
> Because the disk cache is very large, it will take a long time to sync all at one time during 'fclose()',
> so need to call 'fdatasync()' periodically to amortize the time.

If you want to speed up this, then get rid of stdio and use a faster
API like io_uring. What you tried can help but using a better API
would make bigger gains.
  
Yiding Zhou Sept. 1, 2022, 7:40 a.m. UTC | #6
Hi, Ferruh

> > >>> The pcap file will be synchronized to the disk when stopping the device.
> > >>> It takes a long time if the file is large that would cause the
> > >>> 'detach sync request' timeout when the device is closed under
> > >>> multi-process scenario.
> > >>>
> > >>> This commit fixes the issue by performing synchronization in Tx
> > >>> path
> > >>>
> > >>> Fixes: 4c173302c307 ("pcap: add new driver")
> > >>> Cc: stable@dpdk.org
> > >>>
> > >>> Signed-off-by: Yiding Zhou <yidingx.zhou@intel.com>
> > >>> ---
> > >>>    drivers/net/pcap/pcap_ethdev.c | 18 ++++++++++++++++--
> > >>>    1 file changed, 16 insertions(+), 2 deletions(-)
> > >>>
> > >>> diff --git a/drivers/net/pcap/pcap_ethdev.c
> > >>> b/drivers/net/pcap/pcap_ethdev.c index ec29fd6bc5..52eafa5674
> > >>> 100644
> > >>> --- a/drivers/net/pcap/pcap_ethdev.c
> > >>> +++ b/drivers/net/pcap/pcap_ethdev.c
> > >>> @@ -3,7 +3,7 @@
> > >>>     * Copyright(c) 2014 6WIND S.A.
> > >>>     * All rights reserved.
> > >>>     */
> > >>> -
> > >>> +#include <unistd.h>
> > >>>    #include <time.h>
> > >>>
> > >>>    #include <pcap.h>
> > >>> @@ -38,6 +38,8 @@
> > >>>
> > >>>    #define RTE_PMD_PCAP_MAX_QUEUES 16
> > >>>
> > >>> +#define ETH_PCAP_SYNC_THRESHOLD 0x20000000
> > >>> +
> >
> > I guess this is 512MB, can you please comment this.
> > Is there any specific reason to select this value, or is it arbitrary?
> >
> >
> 
> 512M is arbitrary, because there is no API to get the disk cache size associated
> with a specific file.
> I will test the performance impact of different values.
> 
> > >>>    static char errbuf[PCAP_ERRBUF_SIZE];
> > >>>    static struct timespec start_time;
> > >>>    static uint64_t start_cycles;
> > >>> @@ -47,6 +49,8 @@ static uint8_t iface_idx;
> > >>>    static uint64_t timestamp_rx_dynflag;
> > >>>    static int timestamp_dynfield_offset = -1;
> > >>>
> > >>> +RTE_DEFINE_PER_LCORE(uint64_t, _pcap_cached_bytes);
> > >>> +
> > >>>    struct queue_stat {
> > >>>    	volatile unsigned long pkts;
> > >>>    	volatile unsigned long bytes;
> > >>> @@ -144,6 +148,16 @@ static struct rte_eth_link pmd_link = {
> > >>>
> > >>>    RTE_LOG_REGISTER_DEFAULT(eth_pcap_logtype, NOTICE);
> > >>>
> > >>> +static inline void
> > >>> +pcap_dumper_data_sync(pcap_dumper_t *dumper, uint32_t bytes) {
> >
> > 'pcap_' is the namespace for the libpcap, can you select another
> > prefix, like 'eth_pcap_' as many driver functions does.
> >
> > >>> +	RTE_PER_LCORE(_pcap_cached_bytes) += bytes;
> > >>> +	if (unlikely(RTE_PER_LCORE(_pcap_cached_bytes) >
> > >> ETH_PCAP_SYNC_THRESHOLD)) {
> > >>> +		if (!fdatasync(fileno(pcap_dump_file(dumper))))
> > >>> +			RTE_PER_LCORE(_pcap_cached_bytes) = 0;
> > >>> +	}
> > >>> +}
> > >>> +
> >
> > pcap supports multiple queue, and each queue creates a new pcap dumper
> > and single core/thread can be used for this multiple dumpers. In that
> > case I think above per lcore variable logic doesn't work.
> >
> > And instead of having a global value, what do you think to add a
> > variable to 'struct pcap_tx_queue' for this purpose?
> >
> 
> Thanks for the comments, I will follow this.
> 
> > >>>    static struct queue_missed_stat*
> > >>>    queue_missed_stat_update(struct rte_eth_dev *dev, unsigned int qid)
> > >>>    {
> > >>> @@ -421,7 +435,7 @@ eth_pcap_tx_dumper(void *queue, struct
> > >>> rte_mbuf
> > >> **bufs, uint16_t nb_pkts)
> > >>>    	 * process stops and to make sure the pcap file is actually written,
> > >>>    	 * we flush the pcap dumper within each burst.
> > >>>    	 */
> > >>> -	pcap_dump_flush(dumper);
> > >>> +	pcap_dumper_data_sync(dumper, tx_bytes);
> > >>
> > >> 'pcap_dump_flush()' should be doing the same thing, to write buffer
> > >> to file, isn't it working?
> > >>
> > >> Can you check the return value of the 'pcap_dump_flush()' API, I
> > >> wonder if it keeps failing, for some reason?
> > >>
> > >
> > > 'pcap_dump_flush()' returns 0 each time without error, it calls
> > > 'fflush()' to
> > flush userspace buffers to kernel buffers, not disk. 'fdatasync()' to
> > ensure data is written to disk.
> > >
> >
> > 'pcap_dump_flush()' API documentation says "flushes the output buffer
> > to the ``savefile,''", but as you said it uses 'fflush()' internally,
> > so there is a chance that data is not written to the disk.
> >
> > In this case, won't need to keep both, first flush and later fsync/fdatasync?
> >
> 
> I draw a diagram to describe it more clearly
> 
>  fwrite                             fclose/fflush                                                   fclose/fdatasync
> --------->| libc buffer  |----------------> |    disk cache in RAM          |--------------------->
> |disk|
>               | 4096 Bytes |                         | size is determined by OS |
> |       |
> 
> When the libc buffer is full, the system will automatically sync it to the disk
> cache.
> It is easily full as it's only 4096 B size. so there is no need to call 'fflush()' every
> time.
> The real time consuming action is syncing the disk cache to disk.
> Because the disk cache is very large, it will take a long time to sync all at one
> time during 'fclose()', so need to call 'fdatasync()' periodically to amortize the
> time.
> 
> > Do you observe any performance difference after change, since now
> > writing to actual disk on datapath?
> >
> 
> I will verify any performance difference under these 2 scenarios.
> 
I have done some tests, using 'fdatasync()' will cause the packet loss rate to increase by 15%-20%, 
It's not a good solution, need to rework. I plan to use a timer to call pcap_dump_close to fix this, 
do you have any suggestion?
thanks
> > >>>    	dumper_q->tx_stat.pkts += num_tx;
> > >>>    	dumper_q->tx_stat.bytes += tx_bytes;
> > >>>    	dumper_q->tx_stat.err_pkts += nb_pkts - num_tx;
> > >
  

Patch

diff --git a/drivers/net/pcap/pcap_ethdev.c b/drivers/net/pcap/pcap_ethdev.c
index ec29fd6bc5..52eafa5674 100644
--- a/drivers/net/pcap/pcap_ethdev.c
+++ b/drivers/net/pcap/pcap_ethdev.c
@@ -3,7 +3,7 @@ 
  * Copyright(c) 2014 6WIND S.A.
  * All rights reserved.
  */
-
+#include <unistd.h>
 #include <time.h>
 
 #include <pcap.h>
@@ -38,6 +38,8 @@ 
 
 #define RTE_PMD_PCAP_MAX_QUEUES 16
 
+#define ETH_PCAP_SYNC_THRESHOLD 0x20000000
+
 static char errbuf[PCAP_ERRBUF_SIZE];
 static struct timespec start_time;
 static uint64_t start_cycles;
@@ -47,6 +49,8 @@  static uint8_t iface_idx;
 static uint64_t timestamp_rx_dynflag;
 static int timestamp_dynfield_offset = -1;
 
+RTE_DEFINE_PER_LCORE(uint64_t, _pcap_cached_bytes);
+
 struct queue_stat {
 	volatile unsigned long pkts;
 	volatile unsigned long bytes;
@@ -144,6 +148,16 @@  static struct rte_eth_link pmd_link = {
 
 RTE_LOG_REGISTER_DEFAULT(eth_pcap_logtype, NOTICE);
 
+static inline void
+pcap_dumper_data_sync(pcap_dumper_t *dumper, uint32_t bytes)
+{
+	RTE_PER_LCORE(_pcap_cached_bytes) += bytes;
+	if (unlikely(RTE_PER_LCORE(_pcap_cached_bytes) > ETH_PCAP_SYNC_THRESHOLD)) {
+		if (!fdatasync(fileno(pcap_dump_file(dumper))))
+			RTE_PER_LCORE(_pcap_cached_bytes) = 0;
+	}
+}
+
 static struct queue_missed_stat*
 queue_missed_stat_update(struct rte_eth_dev *dev, unsigned int qid)
 {
@@ -421,7 +435,7 @@  eth_pcap_tx_dumper(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	 * process stops and to make sure the pcap file is actually written,
 	 * we flush the pcap dumper within each burst.
 	 */
-	pcap_dump_flush(dumper);
+	pcap_dumper_data_sync(dumper, tx_bytes);
 	dumper_q->tx_stat.pkts += num_tx;
 	dumper_q->tx_stat.bytes += tx_bytes;
 	dumper_q->tx_stat.err_pkts += nb_pkts - num_tx;