[dpdk-dev,v10,1/6] ethdev: add Tx preparation

Message ID 1477327917-18564-2-git-send-email-tomaszx.kulasek@intel.com (mailing list archive)
State Superseded, archived

Commit Message

Tomasz Kulasek Oct. 24, 2016, 4:51 p.m. UTC
  Added API for `rte_eth_tx_prep`

uint16_t rte_eth_tx_prep(uint8_t port_id, uint16_t queue_id,
	struct rte_mbuf **tx_pkts, uint16_t nb_pkts)

Added fields to the `struct rte_eth_desc_lim`:

	uint16_t nb_seg_max;
		/**< Max number of segments per whole packet. */

	uint16_t nb_mtu_seg_max;
		/**< Max number of segments per one MTU */

Added functions:

int rte_validate_tx_offload(struct rte_mbuf *m)
	to validate general requirements for tx offloads set in the mbuf of a
	packet, such as flag completeness. In the current implementation this
	function is called optionally, when RTE_LIBRTE_ETHDEV_DEBUG is enabled.

int rte_phdr_cksum_fix(struct rte_mbuf *m)
	to fix the pseudo-header checksum for TSO and non-TSO tcp/udp packets
	before hardware tx checksum offload:
	 - for non-TSO tcp/udp packets the full pseudo-header checksum is
	   computed and set,
	 - for TSO the IP payload length is not included in the pseudo header.
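
A typical usage sketch (illustrative only; the variable names and the
drop-on-failure policy are assumptions, not part of this patch):

	uint16_t nb_prep;

	nb_prep = rte_eth_tx_prep(port_id, queue_id, pkts, nb_pkts);
	if (nb_prep < nb_pkts) {
		/* pkts[nb_prep] failed validation; rte_errno tells why
		 * (e.g. EINVAL for bad offload flags). Drop it here. */
		rte_pktmbuf_free(pkts[nb_prep]);
	}
	/* send only the packets that passed preparation */
	rte_eth_tx_burst(port_id, queue_id, pkts, nb_prep);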

PERFORMANCE TESTS
-----------------

This feature was tested with a modified csum engine from test-pmd.

The packet checksum preparation was moved from the application to the Tx
preparation step, placed before the burst.

We may expect some overhead costs caused by:
1) using additional callback before burst,
2) rescanning burst,
3) additional condition checking (packet validation),
4) worse optimization (e.g. packet data access, etc.)

We tested it using the ixgbe Tx preparation implementation with some
parts disabled, to have comparable information about the impact of
different parts of the implementation.

IMPACT:

1) For a device with no Tx preparation callback implemented, the
   performance impact is negligible,
2) Packet condition checks without checksum modification (nb_segs,
   available offloads, etc.): 14626628 vs. 14252168 (~2.62% drop),
3) Full support in the ixgbe driver (point 2 plus packet checksum
   initialization): 14060924 vs. 13588094 (~3.48% drop).

Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 config/common_base            |    1 +
 lib/librte_ether/rte_ethdev.h |   96 +++++++++++++++++++++++++++++++++++++++++
 lib/librte_mbuf/rte_mbuf.h    |   64 +++++++++++++++++++++++++++
 lib/librte_net/rte_net.h      |   85 ++++++++++++++++++++++++++++++++++++
 4 files changed, 246 insertions(+)
  

Comments

Olivier Matz Oct. 25, 2016, 2:41 p.m. UTC | #1
Hi Tomasz,

On 10/24/2016 06:51 PM, Tomasz Kulasek wrote:
> Added API for `rte_eth_tx_prep`
> 
> [...]
> --- a/lib/librte_ether/rte_ethdev.h
> +++ b/lib/librte_ether/rte_ethdev.h
> @@ -182,6 +182,7 @@ extern "C" {
>  #include <rte_pci.h>
>  #include <rte_dev.h>
>  #include <rte_devargs.h>
> +#include <rte_errno.h>
>  #include "rte_ether.h"
>  #include "rte_eth_ctrl.h"
>  #include "rte_dev_info.h"
> @@ -699,6 +700,8 @@ struct rte_eth_desc_lim {
>  	uint16_t nb_max;   /**< Max allowed number of descriptors. */
>  	uint16_t nb_min;   /**< Min allowed number of descriptors. */
>  	uint16_t nb_align; /**< Number of descriptors should be aligned to. */
> +	uint16_t nb_seg_max;     /**< Max number of segments per whole packet. */
> +	uint16_t nb_mtu_seg_max; /**< Max number of segments per one MTU */

Sorry if it was not clear in my previous review, but I think this should
be better explained here. You said that the "limitation of number
of segments may differ depend of TSO/non TSO".

As an application developer, I still have some difficulty clearly
understanding what that means. Is it the maximum number
of mbuf segments that contain payload for one tcp-segment sent by
the device?

In that case, it looks quite difficult to verify in an application.
It seems that this field is not used by validate_offload(), so how
should it be used by an application?


>  };
>  
>  /**
> @@ -1188,6 +1191,11 @@ typedef uint16_t (*eth_tx_burst_t)(void *txq,
>  				   uint16_t nb_pkts);
>  /**< @internal Send output packets on a transmit queue of an Ethernet device. */
>  
> +typedef uint16_t (*eth_tx_prep_t)(void *txq,
> +				   struct rte_mbuf **tx_pkts,
> +				   uint16_t nb_pkts);
> +/**< @internal Prepare output packets on a transmit queue of an Ethernet device. */
> +
>  typedef int (*flow_ctrl_get_t)(struct rte_eth_dev *dev,
>  			       struct rte_eth_fc_conf *fc_conf);
>  /**< @internal Get current flow control parameter on an Ethernet device */
> @@ -1622,6 +1630,7 @@ struct rte_eth_rxtx_callback {
>  struct rte_eth_dev {
>  	eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
>  	eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
> +	eth_tx_prep_t tx_pkt_prep; /**< Pointer to PMD transmit prepare function. */
>  	struct rte_eth_dev_data *data;  /**< Pointer to device data */
>  	const struct eth_driver *driver;/**< Driver for this device */
>  	const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
> @@ -2816,6 +2825,93 @@ rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
>  	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
>  }
>  
> +/**
> + * Process a burst of output packets on a transmit queue of an Ethernet device.
> + *
> + * The rte_eth_tx_prep() function is invoked to prepare output packets to be
> + * transmitted on the output queue *queue_id* of the Ethernet device designated
> + * by its *port_id*.
> + * The *nb_pkts* parameter is the number of packets to be prepared which are
> + * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them
> + * allocated from a pool created with rte_pktmbuf_pool_create().
> + * For each packet to send, the rte_eth_tx_prep() function performs
> + * the following operations:
> + *
> + * - Check if packet meets devices requirements for tx offloads.
> + *
> + * - Check limitations about number of segments.
> + *
> + * - Check additional requirements when debug is enabled.
> + *
> + * - Update and/or reset required checksums when tx offload is set for packet.
> + *
> + * The rte_eth_tx_prep() function returns the number of packets ready to be
> + * sent. A return value equal to *nb_pkts* means that all packets are valid and
> + * ready to be sent.
> + *
> + * @param port_id
> + *   The port identifier of the Ethernet device.
> + *   The value must be a valid port id.
> + * @param queue_id
> + *   The index of the transmit queue through which output packets must be
> + *   sent.
> + *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
> + *   to rte_eth_dev_configure().
> + * @param tx_pkts
> + *   The address of an array of *nb_pkts* pointers to *rte_mbuf* structures
> + *   which contain the output packets.
> + * @param nb_pkts
> + *   The maximum number of packets to process.
> + * @return
> + *   The number of packets correct and ready to be sent. The return value can be
> + *   less than the value of the *tx_pkts* parameter when some packet doesn't
> + *   meet devices requirements with rte_errno set appropriately.
> + */

Inserting here the previous comment:

>> Can we add the constraint that invalid packets are left untouched?
>>
>> I think most of the time there will be a software fallback in that case,
>> so it would be good to ensure that this function does not change the flags
>> or the packet data.
> 
> In the current implementation, if a packet is invalid, its data is never modified. Only checks are done. The only exception is when a checksum needs to be updated or initialized, but that is done after packet validation.
> If we want to use/restore the packet in the application, it must not be changed in any way for invalid packets.
> 

I think this should be explicitly said in the API comment that
valid packets may be modified (*), but invalid packets (whose index
is >= the return value) are left untouched.

(*) we still need to discuss that point, see below.

Another comment was made:

>> Another thing that could be interesting for the caller is to know the
>> reason of the failure. Maybe the different errno types could be detailed
>> here. For instance:
>> - EINVAL: offload flags are not correctly set (i.e. would fail whatever
>>   the hardware)
>> - ENOTSUP: the offload feature is not supported by the hardware
>> - ...
>>
> 
> Ok.

Don't you feel it could go in the API comment too?


> [...]
>
> +/**
> + * Fix pseudo header checksum
> + *
> + * This function fixes pseudo header checksum for TSO and non-TSO tcp/udp in
> + * provided mbufs packet data.
> + *
> + * - for non-TSO tcp/udp packets full pseudo-header checksum is counted and set
> + *   in packet data,
> + * - for TSO the IP payload length is not included in pseudo header.
> + *
> + * This function expects that used headers are in the first data segment of
> + * mbuf, and are not fragmented.

There is another requirement about cloning and the reference count.
Sorry I did not answer Konstantin, but I still think this could be
an issue.

For instance, I think that the zero-copy mode of the vhost application
references the packet sent by the guest and sends the data. The payload
should not be modified because it is in guest memory (we don't know,
maybe the guest also cloned it for its own purpose).

It means that the tx_prep() API must not be used with clones, i.e.
the headers must not reside in a segment for which RTE_MBUF_INDIRECT(seg)
is true or rte_mbuf_refcnt_read(seg) > 1 (see the sketch below).

- if we really want this API in 16.11, it should be clearly explained
  in the API comment that it does not work with shared segments

- for next versions, we have to take a decision whether it should be
  supported or not. In my opinion, cloned packets are useful and should
  be supported properly by all dpdk APIs.
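
For illustration, a minimal sketch of such a check on the application
side (the helper name is hypothetical, and it assumes the headers are in
the first segment, as the API already requires):

	/* hypothetical helper: the header segment must not be shared */
	static inline int
	tx_prep_hdr_writable(const struct rte_mbuf *m)
	{
		return !RTE_MBUF_INDIRECT(m) && rte_mbuf_refcnt_read(m) == 1;
	}

Only packets for which this returns 1 would be safe to pass to
tx_prep().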


Thanks,
Olivier
  
Tomasz Kulasek Oct. 25, 2016, 5:28 p.m. UTC | #2
> -----Original Message-----
> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Tuesday, October 25, 2016 16:42
> To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>; dev@dpdk.org
> Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Subject: Re: [PATCH v10 1/6] ethdev: add Tx preparation
> 
> Hi Tomasz,
> 
> On 10/24/2016 06:51 PM, Tomasz Kulasek wrote:
> > Added API for `rte_eth_tx_prep`
> >
> > [...]
> > --- a/lib/librte_ether/rte_ethdev.h
> > +++ b/lib/librte_ether/rte_ethdev.h
> > @@ -182,6 +182,7 @@ extern "C" {
> >  #include <rte_pci.h>
> >  #include <rte_dev.h>
> >  #include <rte_devargs.h>
> > +#include <rte_errno.h>
> >  #include "rte_ether.h"
> >  #include "rte_eth_ctrl.h"
> >  #include "rte_dev_info.h"
> > @@ -699,6 +700,8 @@ struct rte_eth_desc_lim {
> >  	uint16_t nb_max;   /**< Max allowed number of descriptors. */
> >  	uint16_t nb_min;   /**< Min allowed number of descriptors. */
> >  	uint16_t nb_align; /**< Number of descriptors should be aligned to.
> > */
> > +	uint16_t nb_seg_max;     /**< Max number of segments per whole
> packet. */
> > +	uint16_t nb_mtu_seg_max; /**< Max number of segments per one MTU */
> 
> Sorry if it was not clear in my previous review, but I think this should
> be better explained here. You said that the "limitation of number of
> segments may differ depend of TSO/non TSO".
> 
> As an application developer, I still have some difficulties to clearly
> understand what does that mean. Is it the maximum number of mbuf-segments
> that contain payload for one tcp-segment sent by the device?
> 
> In that case, it looks quite difficult to verify that in an application.
> It looks that this field is not used by validate_offload(), so how should
> it be used by an application?
> 

E.g. for i40e (from the XL710 datasheet):

  8.4.1 Transmit Packet in System Memory
   ...
  A few rules related to the transmit packet in host memory are:
   ...
   - A single transmit packet may span up to 8 buffers (up to 8 data descriptors
     per packet including both the header and payload buffers).
   - The total number of data descriptors for the whole TSO (explained later on in
     this chapter) is unlimited as long as each segment within the TSO obeys
     the previous rule (up to 8 data descriptors per segment for both the TSO
     header and the segment payload buffers).
  ...

+#define I40E_TX_MAX_SEG     UINT8_MAX
+#define I40E_TX_MAX_MTU_SEG 8

For the ixgbe driver there is a single limitation, the same for TSO and
non-TSO, and it is always 40-WTHRESH.

Such differences mean that these values should be checked in the tx_prep
callback (when required) for a specific device (e.g. i40e, ixgbe).

If another device has no such limitations, or its limits are higher than
DPDK can handle (nb_segs is a uint8_t), then for performance reasons they
don't need to be checked.

rte_validate_tx_offload() is for the general checks; the driver-specific
implementation is the part in the callback function, e.g. in

  uint16_t ixgbe_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
		uint16_t nb_pkts)

for ixgbe.

The values in the "struct rte_eth_desc_lim" provide information to the
application that should let it avoid creating malicious packets, or at
least limit their number.

Using the Tx preparation API, these checks are made transparently to the
application, and when nb_segs is out of limits, the tx_prep function
fails.
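
For illustration, an application could also read these limits itself (a
sketch; error handling omitted, "port_id" and "m" assumed to exist):

	struct rte_eth_dev_info dev_info;

	rte_eth_dev_info_get(port_id, &dev_info);
	/* a packet exceeding this limit would be rejected by tx_prep */
	if (m->nb_segs > dev_info.tx_desc_lim.nb_seg_max)
		rte_pktmbuf_free(m);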

> 
> >  };
> >
> >  /**
> > @@ -1188,6 +1191,11 @@ typedef uint16_t (*eth_tx_burst_t)(void *txq,
> >  				   uint16_t nb_pkts);
> >  /**< @internal Send output packets on a transmit queue of an Ethernet
> > device. */
> >
> > +typedef uint16_t (*eth_tx_prep_t)(void *txq,
> > +				   struct rte_mbuf **tx_pkts,
> > +				   uint16_t nb_pkts);
> > +/**< @internal Prepare output packets on a transmit queue of an
> > +Ethernet device. */
> > +
> >  typedef int (*flow_ctrl_get_t)(struct rte_eth_dev *dev,
> >  			       struct rte_eth_fc_conf *fc_conf);  /**< @internal
> Get
> > current flow control parameter on an Ethernet device */ @@ -1622,6
> > +1630,7 @@ struct rte_eth_rxtx_callback {  struct rte_eth_dev {
> >  	eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function.
> */
> >  	eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function.
> > */
> > +	eth_tx_prep_t tx_pkt_prep; /**< Pointer to PMD transmit prepare
> > +function. */
> >  	struct rte_eth_dev_data *data;  /**< Pointer to device data */
> >  	const struct eth_driver *driver;/**< Driver for this device */
> >  	const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
> > @@ -2816,6 +2825,93 @@ rte_eth_tx_burst(uint8_t port_id, uint16_t
> queue_id,
> >  	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts,
> > nb_pkts);  }
> >
> > +/**
> > + * Process a burst of output packets on a transmit queue of an Ethernet
> device.
> > + *
> > + * The rte_eth_tx_prep() function is invoked to prepare output
> > +packets to be
> > + * transmitted on the output queue *queue_id* of the Ethernet device
> > +designated
> > + * by its *port_id*.
> > + * The *nb_pkts* parameter is the number of packets to be prepared
> > +which are
> > + * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of
> > +them
> > + * allocated from a pool created with rte_pktmbuf_pool_create().
> > + * For each packet to send, the rte_eth_tx_prep() function performs
> > + * the following operations:
> > + *
> > + * - Check if packet meets devices requirements for tx offloads.
> > + *
> > + * - Check limitations about number of segments.
> > + *
> > + * - Check additional requirements when debug is enabled.
> > + *
> > + * - Update and/or reset required checksums when tx offload is set for
> packet.
> > + *
> > + * The rte_eth_tx_prep() function returns the number of packets ready
> > +to be
> > + * sent. A return value equal to *nb_pkts* means that all packets are
> > +valid and
> > + * ready to be sent.
> > + *
> > + * @param port_id
> > + *   The port identifier of the Ethernet device.
> > + *   The value must be a valid port id.
> > + * @param queue_id
> > + *   The index of the transmit queue through which output packets must
> be
> > + *   sent.
> > + *   The value must be in the range [0, nb_tx_queue - 1] previously
> supplied
> > + *   to rte_eth_dev_configure().
> > + * @param tx_pkts
> > + *   The address of an array of *nb_pkts* pointers to *rte_mbuf*
> structures
> > + *   which contain the output packets.
> > + * @param nb_pkts
> > + *   The maximum number of packets to process.
> > + * @return
> > + *   The number of packets correct and ready to be sent. The return
> value can be
> > + *   less than the value of the *tx_pkts* parameter when some packet
> doesn't
> > + *   meet devices requirements with rte_errno set appropriately.
> > + */
> 
> Inserting here the previous comment:
> 
> >> Can we add the constraint that invalid packets are left untouched?
> >>
> >> I think most of the time there will be a software fallback in that
> >> case, so it would be good to ensure that this function does not
> >> change the flags or the packet data.
> >
> > In the current implementation, if a packet is invalid, its data is
> > never modified. Only checks are done. The only exception is when a
> > checksum needs to be updated or initialized, but that is done after
> > packet validation.
> > If we want to use/restore the packet in the application, it must not
> > be changed in any way for invalid packets.
> >
> 
> I think this should be explicitly said in the API comment that valid
> packets may be modified (*), but invalid packets (whose index is >= than
> the return value) are left untouched.
> 
> (*) we still need to discuss that point, see below.
> 
> Another comment was made:
> 
> >> Another thing that could be interesting for the caller is to know the
> >> reason of the failure. Maybe the different errno types could be
> >> detailed here. For instance:
> >> - EINVAL: offload flags are not correctly set (i.e. would fail whatever
> >>   the hardware)
> >> - ENOTSUP: the offload feature is not supported by the hardware
> >> - ...
> >>
> >
> > Ok.
> 
> Don't you feel it could go in the API comment too?
> 

Ok, I will modify comments.

> 
> > [...]
> >
> > +/**
> > + * Fix pseudo header checksum
> > + *
> > + * This function fixes pseudo header checksum for TSO and non-TSO
> > +tcp/udp in
> > + * provided mbufs packet data.
> > + *
> > + * - for non-TSO tcp/udp packets full pseudo-header checksum is counted
> and set
> > + *   in packet data,
> > + * - for TSO the IP payload length is not included in pseudo header.
> > + *
> > + * This function expects that used headers are in the first data
> > +segment of
> > + * mbuf, and are not fragmented.
> 
> There is another requirement about the cloning and reference count.
> Sorry I did not answer to Konstantin, but I still think this could be an
> issue.
> 
> For instance, I think that the zero-copy mode of vhost application
> references the packet sent by the guest and send the data. The payload
> should not be modified because it is in guest memory (we don't know, maybe
> the guest also cloned it for its own purpose).
> 
> It means that the tx_prep() API must not be used with clones, i.e the
> headers must not reside in a segment whose RTE_MBUF_INDIRECT(seg) or
> rte_mbuf_refcnt_read(seg) > 1.
> 
> - if we really want this API in 16.11, it should be clearly explained
>   in the API comment that it does not work with shared segments
> 

Ok, I will.

> - for next versions, we have to take a decision whether it should be
>   supported or not. In my opinion, cloned packets are useful and should
>   be supported properly by all dpdk APIs.
> 
> 
> Thanks,
> Olivier

Thanks,
Tomasz
  

Patch

diff --git a/config/common_base b/config/common_base
index c7fd3db..619284b 100644
--- a/config/common_base
+++ b/config/common_base
@@ -120,6 +120,7 @@  CONFIG_RTE_MAX_QUEUES_PER_PORT=1024
 CONFIG_RTE_LIBRTE_IEEE1588=n
 CONFIG_RTE_ETHDEV_QUEUE_STAT_CNTRS=16
 CONFIG_RTE_ETHDEV_RXTX_CALLBACKS=y
+CONFIG_RTE_ETHDEV_TX_PREP=y
 
 #
 # Support NIC bypass logic
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 38641e8..c4a8ccd 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -182,6 +182,7 @@  extern "C" {
 #include <rte_pci.h>
 #include <rte_dev.h>
 #include <rte_devargs.h>
+#include <rte_errno.h>
 #include "rte_ether.h"
 #include "rte_eth_ctrl.h"
 #include "rte_dev_info.h"
@@ -699,6 +700,8 @@  struct rte_eth_desc_lim {
 	uint16_t nb_max;   /**< Max allowed number of descriptors. */
 	uint16_t nb_min;   /**< Min allowed number of descriptors. */
 	uint16_t nb_align; /**< Number of descriptors should be aligned to. */
+	uint16_t nb_seg_max;     /**< Max number of segments per whole packet. */
+	uint16_t nb_mtu_seg_max; /**< Max number of segments per one MTU */
 };
 
 /**
@@ -1188,6 +1191,11 @@  typedef uint16_t (*eth_tx_burst_t)(void *txq,
 				   uint16_t nb_pkts);
 /**< @internal Send output packets on a transmit queue of an Ethernet device. */
 
+typedef uint16_t (*eth_tx_prep_t)(void *txq,
+				   struct rte_mbuf **tx_pkts,
+				   uint16_t nb_pkts);
+/**< @internal Prepare output packets on a transmit queue of an Ethernet device. */
+
 typedef int (*flow_ctrl_get_t)(struct rte_eth_dev *dev,
 			       struct rte_eth_fc_conf *fc_conf);
 /**< @internal Get current flow control parameter on an Ethernet device */
@@ -1622,6 +1630,7 @@  struct rte_eth_rxtx_callback {
 struct rte_eth_dev {
 	eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
 	eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
+	eth_tx_prep_t tx_pkt_prep; /**< Pointer to PMD transmit prepare function. */
 	struct rte_eth_dev_data *data;  /**< Pointer to device data */
 	const struct eth_driver *driver;/**< Driver for this device */
 	const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
@@ -2816,6 +2825,93 @@  rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
 	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
 }
 
+/**
+ * Process a burst of output packets on a transmit queue of an Ethernet device.
+ *
+ * The rte_eth_tx_prep() function is invoked to prepare output packets to be
+ * transmitted on the output queue *queue_id* of the Ethernet device designated
+ * by its *port_id*.
+ * The *nb_pkts* parameter is the number of packets to be prepared which are
+ * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them
+ * allocated from a pool created with rte_pktmbuf_pool_create().
+ * For each packet to send, the rte_eth_tx_prep() function performs
+ * the following operations:
+ *
+ * - Check if packet meets devices requirements for tx offloads.
+ *
+ * - Check limitations about number of segments.
+ *
+ * - Check additional requirements when debug is enabled.
+ *
+ * - Update and/or reset required checksums when tx offload is set for packet.
+ *
+ * The rte_eth_tx_prep() function returns the number of packets ready to be
+ * sent. A return value equal to *nb_pkts* means that all packets are valid and
+ * ready to be sent.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ *   The value must be a valid port id.
+ * @param queue_id
+ *   The index of the transmit queue through which output packets must be
+ *   sent.
+ *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param tx_pkts
+ *   The address of an array of *nb_pkts* pointers to *rte_mbuf* structures
+ *   which contain the output packets.
+ * @param nb_pkts
+ *   The maximum number of packets to process.
+ * @return
+ *   The number of packets correct and ready to be sent. The return value can be
+ *   less than the value of the *tx_pkts* parameter when some packet doesn't
+ *   meet devices requirements with rte_errno set appropriately.
+ */
+
+#ifdef RTE_ETHDEV_TX_PREP
+
+static inline uint16_t
+rte_eth_tx_prep(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	struct rte_eth_dev *dev;
+
+#ifdef RTE_LIBRTE_ETHDEV_DEBUG
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		RTE_PMD_DEBUG_TRACE("Invalid TX port_id=%d\n", port_id);
+		rte_errno = -EINVAL;
+		return 0;
+	}
+#endif
+
+	dev = &rte_eth_devices[port_id];
+
+#ifdef RTE_LIBRTE_ETHDEV_DEBUG
+	if (queue_id >= dev->data->nb_tx_queues) {
+		RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", queue_id);
+		rte_errno = -EINVAL;
+		return 0;
+	}
+#endif
+
+	if (!dev->tx_pkt_prep)
+		return nb_pkts;
+
+	return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id],
+			tx_pkts, nb_pkts);
+}
+
+#else
+
+static inline uint16_t
+rte_eth_tx_prep(__rte_unused uint8_t port_id, __rte_unused uint16_t queue_id,
+		__rte_unused struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	return nb_pkts;
+}
+
+#endif
+
 typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count,
 		void *userdata);
 
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 109e666..ff9e749 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -283,6 +283,19 @@  extern "C" {
  */
 #define PKT_TX_OUTER_IPV6    (1ULL << 60)
 
+/**
+ * Bit Mask of all supported packet Tx offload features flags, which can be set
+ * for packet.
+ */
+#define PKT_TX_OFFLOAD_MASK (    \
+		PKT_TX_IP_CKSUM |        \
+		PKT_TX_L4_MASK |         \
+		PKT_TX_OUTER_IP_CKSUM |  \
+		PKT_TX_TCP_SEG |         \
+		PKT_TX_QINQ_PKT |        \
+		PKT_TX_VLAN_PKT |        \
+		PKT_TX_TUNNEL_MASK)
+
 #define __RESERVED           (1ULL << 61) /**< reserved for future mbuf use */
 
 #define IND_ATTACHED_MBUF    (1ULL << 62) /**< Indirect attached mbuf */
@@ -1647,6 +1660,57 @@  static inline int rte_pktmbuf_chain(struct rte_mbuf *head, struct rte_mbuf *tail
 }
 
 /**
+ * Validate general requirements for tx offload in mbuf.
+ *
+ * This function checks correctness and completeness of Tx offload settings.
+ *
+ * @param m
+ *   The packet mbuf to be validated.
+ * @return
+ *   0 if packet is valid
+ */
+static inline int
+rte_validate_tx_offload(const struct rte_mbuf *m)
+{
+	uint64_t ol_flags = m->ol_flags;
+	uint64_t inner_l3_offset = m->l2_len;
+
+	/* Does packet set any of available offloads? */
+	if (!(ol_flags & PKT_TX_OFFLOAD_MASK))
+		return 0;
+
+	if (ol_flags & PKT_TX_OUTER_IP_CKSUM)
+		inner_l3_offset += m->outer_l2_len + m->outer_l3_len;
+
+	/* Headers are fragmented */
+	if (rte_pktmbuf_data_len(m) < inner_l3_offset + m->l3_len + m->l4_len)
+		return -ENOTSUP;
+
+	/* IP checksum can be counted only for IPv4 packet */
+	if ((ol_flags & PKT_TX_IP_CKSUM) && (ol_flags & PKT_TX_IPV6))
+		return -EINVAL;
+
+	/* IP type not set when required */
+	if (ol_flags & (PKT_TX_L4_MASK | PKT_TX_TCP_SEG))
+		if (!(ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)))
+			return -EINVAL;
+
+	/* Check requirements for TSO packet */
+	if (ol_flags & PKT_TX_TCP_SEG)
+		if ((m->tso_segsz == 0) ||
+				((ol_flags & PKT_TX_IPV4) &&
+				!(ol_flags & PKT_TX_IP_CKSUM)))
+			return -EINVAL;
+
+	/* PKT_TX_OUTER_IP_CKSUM set for non outer IPv4 packet. */
+	if ((ol_flags & PKT_TX_OUTER_IP_CKSUM) &&
+			!(ol_flags & PKT_TX_OUTER_IPV4))
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
  * Dump an mbuf structure to a file.
  *
  * Dump all fields for the given packet mbuf and all its associated
diff --git a/lib/librte_net/rte_net.h b/lib/librte_net/rte_net.h
index d4156ae..90af335 100644
--- a/lib/librte_net/rte_net.h
+++ b/lib/librte_net/rte_net.h
@@ -38,6 +38,11 @@ 
 extern "C" {
 #endif
 
+#include <rte_ip.h>
+#include <rte_udp.h>
+#include <rte_tcp.h>
+#include <rte_sctp.h>
+
 /**
  * Structure containing header lengths associated to a packet, filled
  * by rte_net_get_ptype().
@@ -86,6 +91,86 @@  struct rte_net_hdr_lens {
 uint32_t rte_net_get_ptype(const struct rte_mbuf *m,
 	struct rte_net_hdr_lens *hdr_lens, uint32_t layers);
 
+/**
+ * Fix pseudo header checksum
+ *
+ * This function fixes pseudo header checksum for TSO and non-TSO tcp/udp in
+ * provided mbufs packet data.
+ *
+ * - for non-TSO tcp/udp packets full pseudo-header checksum is counted and set
+ *   in packet data,
+ * - for TSO the IP payload length is not included in pseudo header.
+ *
+ * This function expects that used headers are in the first data segment of
+ * mbuf, and are not fragmented.
+ *
+ * @param m
+ *   The packet mbuf to be validated.
+ * @return
+ *   0 if checksum is initialized properly
+ */
+static inline int
+rte_phdr_cksum_fix(struct rte_mbuf *m)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	struct tcp_hdr *tcp_hdr;
+	struct udp_hdr *udp_hdr;
+	uint64_t ol_flags = m->ol_flags;
+	uint64_t inner_l3_offset = m->l2_len;
+
+	if (ol_flags & PKT_TX_OUTER_IP_CKSUM)
+		inner_l3_offset += m->outer_l2_len + m->outer_l3_len;
+
+	if ((ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
+		if (ol_flags & PKT_TX_IPV4) {
+			ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *,
+					inner_l3_offset);
+
+			if (ol_flags & PKT_TX_IP_CKSUM)
+				ipv4_hdr->hdr_checksum = 0;
+
+			udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr +
+					m->l3_len);
+			udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr,
+					ol_flags);
+		} else {
+			ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *,
+					inner_l3_offset);
+			/* non-TSO udp */
+			udp_hdr = rte_pktmbuf_mtod_offset(m, struct udp_hdr *,
+					inner_l3_offset + m->l3_len);
+			udp_hdr->dgram_cksum = rte_ipv6_phdr_cksum(ipv6_hdr,
+					ol_flags);
+		}
+	} else if ((ol_flags & PKT_TX_TCP_CKSUM) ||
+			(ol_flags & PKT_TX_TCP_SEG)) {
+		if (ol_flags & PKT_TX_IPV4) {
+			ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *,
+					inner_l3_offset);
+
+			if (ol_flags & PKT_TX_IP_CKSUM)
+				ipv4_hdr->hdr_checksum = 0;
+
+			/* non-TSO tcp or TSO */
+			tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr +
+					m->l3_len);
+			tcp_hdr->cksum = rte_ipv4_phdr_cksum(ipv4_hdr,
+					ol_flags);
+		} else {
+			ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *,
+					inner_l3_offset);
+			/* non-TSO tcp or TSO */
+			tcp_hdr = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *,
+					inner_l3_offset + m->l3_len);
+			tcp_hdr->cksum = rte_ipv6_phdr_cksum(ipv6_hdr,
+					ol_flags);
+		}
+	}
+
+	return 0;
+}
+
 #ifdef __cplusplus
 }
 #endif