[dpdk-dev,2/5] cxgbe: add cxgbe poll mode driver.

Message ID 25f06823ef87d05bf57d996df387478e7fbc966f.1432300701.git.rahul.lakkireddy@chelsio.com (mailing list archive)
State Changes Requested, archived

Commit Message

Rahul Lakkireddy May 22, 2015, 1:24 p.m. UTC
Adds the cxgbe poll mode driver for DPDK under the lib/librte_pmd_cxgbe
directory. This patch:

1. Adds the Makefile to compile cxgbe pmd.
2. Registers and initializes the cxgbe pmd driver.
3. Implements the necessary eth_dev_ops.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
---
 lib/librte_pmd_cxgbe/Makefile       |   74 ++
 lib/librte_pmd_cxgbe/cxgbe.h        |   60 +
 lib/librte_pmd_cxgbe/cxgbe_compat.h |  290 +++++
 lib/librte_pmd_cxgbe/cxgbe_ethdev.c |  796 +++++++++++++
 lib/librte_pmd_cxgbe/cxgbe_main.c   | 1219 +++++++++++++++++++
 lib/librte_pmd_cxgbe/sge.c          | 2250 +++++++++++++++++++++++++++++++++++
 6 files changed, 4689 insertions(+), 0 deletions(-)
 create mode 100644 lib/librte_pmd_cxgbe/Makefile
 create mode 100644 lib/librte_pmd_cxgbe/cxgbe.h
 create mode 100644 lib/librte_pmd_cxgbe/cxgbe_compat.h
 create mode 100644 lib/librte_pmd_cxgbe/cxgbe_ethdev.c
 create mode 100644 lib/librte_pmd_cxgbe/cxgbe_main.c
 create mode 100644 lib/librte_pmd_cxgbe/sge.c
  

Comments

Stephen Hemminger May 22, 2015, 4:42 p.m. UTC | #1
On Fri, 22 May 2015 18:54:20 +0530
Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:

> +#define pr_err(y, args...) dev_err(0, y, ##args)
> +#define pr_warn(y, args...) dev_warn(0, y, ##args)
> +#define pr_info(y, args...) dev_info(0, y, ##args)
> +#define BUG() pr_err("BUG at %s:%d", __func__, __LINE__)
> +
> +#define ASSERT(x) do {\
> +	if (!(x)) \
> +		rte_panic("CXGBE: x"); \
> +} while (0)
> +#define BUG_ON(x) ASSERT(!(x))
> +
> +#ifndef WARN_ON
> +#define WARN_ON(x) do { \
> +	int ret = !!(x); \
> +	if (unlikely(ret)) \
> +		pr_warn("WARN_ON: \"" #x "\" at %s:%d\n", __func__, __LINE__); \
> +} while (0)
> +#endif
> +
> +#define __iomem
> +
> +#ifndef BIT
> +#define BIT(n) (1 << (n))
> +#endif
> +
> +#define L1_CACHE_SHIFT  6
> +#define L1_CACHE_BYTES  BIT(L1_CACHE_SHIFT)
> +
> +#define PAGE_SHIFT  12
> +#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))
> +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
> +
> +#define VLAN_HLEN 4
> +
> +#define rmb()     rte_rmb() /* dpdk rte provided rmb */
> +#define wmb()     rte_wmb() /* dpdk rte provided wmb */
> +
> +typedef uint8_t   u8;
> +typedef int8_t    s8;
> +typedef uint16_t  u16;
> +typedef uint32_t  u32;
> +typedef int32_t   s32;
> +typedef uint64_t  u64;
> +typedef int       bool;
> +typedef uint64_t  dma_addr_t;
> +
> +#ifndef __le16
> +#define __le16	uint16_t
> +#endif
> +#ifndef __le32
> +#define __le32	uint32_t
> +#endif
> +#ifndef __le64
> +#define __le64	uint64_t
> +#endif
> +#ifndef __be16
> +#define __be16	uint16_t
> +#endif
> +#ifndef __be32
> +#define __be32	uint32_t
> +#endif
> +#ifndef __be64
> +#define __be64	uint64_t
> +#endif
> +#ifndef __u8
> +#define __u8	uint8_t
> +#endif
> +#ifndef __u16
> +#define __u16	uint16_t
> +#endif
> +#ifndef __u32
> +#define __u32	uint32_t
> +#endif
> +#ifndef __u64
> +#define __u64	uint64_t
> +#endif
> +
> +#define FALSE	0
> +#define TRUE	1
> +#define false	0
> +#define true	1
> +
> +#define min(a, b) RTE_MIN(a, b)
> +#define max(a, b) RTE_MAX(a, b)

This is not Linux kernel.
Please don't create wrappers for all the stuff in Linux to port your driver.
  
Stephen Hemminger May 22, 2015, 4:43 p.m. UTC | #2
On Fri, 22 May 2015 18:54:20 +0530
Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:

> +/**
> + * fls - find last (most-significant) bit set
> + * @x: the word to search
> + *
> + * This is defined the same way as ffs.
> + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
> + */
> +static inline int fls(int x)

Isn't there a Gcc intrinsic already for this?
  
Stephen Hemminger May 22, 2015, 4:46 p.m. UTC | #3
On Fri, 22 May 2015 18:54:20 +0530
Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:

> +static int cxgbe_dev_link_update(struct rte_eth_dev *eth_dev,
> +				 __rte_unused int wait_to_complete)
> +{
> +	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
> +	struct adapter *adapter = pi->adapter;
> +	struct sge *s = &adapter->sge;
> +	unsigned int work_done, budget = 4;
> +	int ret;
> +
> +	cxgbe_poll(&s->fw_evtq, NULL, budget, &work_done);
> +	ret = pi->link_cfg.link_ok;
> +	eth_dev->data->dev_link.link_status = pi->link_cfg.link_ok;
> +	eth_dev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
> +	eth_dev->data->dev_link.link_speed = pi->link_cfg.speed;
> +	return ret;

It is not well documented, but link_update is supposed to return 0
when the link status changed and -1 if it did not change.
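
As an illustration of that convention, a minimal sketch against the v1 code
quoted above (not the actual v2 change) would compare the newly polled state
with the previously reported one and return 0 only when it changed:

static int cxgbe_dev_link_update(struct rte_eth_dev *eth_dev,
				 __rte_unused int wait_to_complete)
{
	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
	struct adapter *adapter = pi->adapter;
	struct sge *s = &adapter->sge;
	struct rte_eth_link old_link = eth_dev->data->dev_link;
	unsigned int work_done, budget = 4;

	cxgbe_poll(&s->fw_evtq, NULL, budget, &work_done);
	eth_dev->data->dev_link.link_status = pi->link_cfg.link_ok;
	eth_dev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
	eth_dev->data->dev_link.link_speed = pi->link_cfg.speed;

	/* 0: link status changed, -1: unchanged */
	return old_link.link_status ==
	       eth_dev->data->dev_link.link_status ? -1 : 0;
}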
  
Rahul Lakkireddy May 23, 2015, 5:53 a.m. UTC | #4
On Fri, May 22, 2015 at 09:46:38 -0700, Stephen Hemminger wrote:
> On Fri, 22 May 2015 18:54:20 +0530
> Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:
> 
> > +static int cxgbe_dev_link_update(struct rte_eth_dev *eth_dev,
> > +				 __rte_unused int wait_to_complete)
> > +{
> > +	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
> > +	struct adapter *adapter = pi->adapter;
> > +	struct sge *s = &adapter->sge;
> > +	unsigned int work_done, budget = 4;
> > +	int ret;
> > +
> > +	cxgbe_poll(&s->fw_evtq, NULL, budget, &work_done);
> > +	ret = pi->link_cfg.link_ok;
> > +	eth_dev->data->dev_link.link_status = pi->link_cfg.link_ok;
> > +	eth_dev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
> > +	eth_dev->data->dev_link.link_speed = pi->link_cfg.speed;
> > +	return ret;
> 
> It is not well documented, but link_update is supposed to return 0
> when the link status changed and -1 if it did not change.

Ok. We will correct the return value in v2 posting.
We need to rebase anyway since PMDs now seem to have moved to
drivers/net directory.


Thanks,
Rahul.
  
Rahul Lakkireddy May 23, 2015, 5:56 a.m. UTC | #5
On Fri, May 22, 2015 at 09:43:20 -0700, Stephen Hemminger wrote:
> On Fri, 22 May 2015 18:54:20 +0530
> Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:
> 
> > +/**
> > + * fls - find last (most-significant) bit set
> > + * @x: the word to search
> > + *
> > + * This is defined the same way as ffs.
> > + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
> > + */
> > +static inline int fls(int x)
> 
> Isn't there a Gcc intrinsic already for this?

I guess you are referring to __builtin_clz?
I will do that change in v2.


Thanks,
Rahul.
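
For reference, an intrinsic-based fls() could look like the sketch below
(assuming a 32-bit int); note that __builtin_clz() is undefined for 0, so the
fls(0) == 0 case still needs an explicit check:

static inline int fls(int x)
{
	/* __builtin_clz(0) is undefined, keep fls(0) == 0 explicit */
	return x ? 32 - __builtin_clz((unsigned int)x) : 0;
}

The same idea would also cover hweight32() via __builtin_popcount().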
  
Rahul Lakkireddy May 23, 2015, 5:57 a.m. UTC | #6
On Fri, May 22, 2015 at 09:42:50 -0700, Stephen Hemminger wrote:
> On Fri, 22 May 2015 18:54:20 +0530
> Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:
> 
> > +#define pr_err(y, args...) dev_err(0, y, ##args)
> > +#define pr_warn(y, args...) dev_warn(0, y, ##args)
> > +#define pr_info(y, args...) dev_info(0, y, ##args)
> > +#define BUG() pr_err("BUG at %s:%d", __func__, __LINE__)
> > +
> > +#define ASSERT(x) do {\
> > +	if (!(x)) \
> > +		rte_panic("CXGBE: x"); \
> > +} while (0)
> > +#define BUG_ON(x) ASSERT(!(x))
> > +
> > +#ifndef WARN_ON
> > +#define WARN_ON(x) do { \
> > +	int ret = !!(x); \
> > +	if (unlikely(ret)) \
> > +		pr_warn("WARN_ON: \"" #x "\" at %s:%d\n", __func__, __LINE__); \
> > +} while (0)
> > +#endif
> > +
> > +#define __iomem
> > +
> > +#ifndef BIT
> > +#define BIT(n) (1 << (n))
> > +#endif
> > +
> > +#define L1_CACHE_SHIFT  6
> > +#define L1_CACHE_BYTES  BIT(L1_CACHE_SHIFT)
> > +
> > +#define PAGE_SHIFT  12
> > +#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))
> > +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
> > +
> > +#define VLAN_HLEN 4
> > +
> > +#define rmb()     rte_rmb() /* dpdk rte provided rmb */
> > +#define wmb()     rte_wmb() /* dpdk rte provided wmb */
> > +
> > +typedef uint8_t   u8;
> > +typedef int8_t    s8;
> > +typedef uint16_t  u16;
> > +typedef uint32_t  u32;
> > +typedef int32_t   s32;
> > +typedef uint64_t  u64;
> > +typedef int       bool;
> > +typedef uint64_t  dma_addr_t;
> > +
> > +#ifndef __le16
> > +#define __le16	uint16_t
> > +#endif
> > +#ifndef __le32
> > +#define __le32	uint32_t
> > +#endif
> > +#ifndef __le64
> > +#define __le64	uint64_t
> > +#endif
> > +#ifndef __be16
> > +#define __be16	uint16_t
> > +#endif
> > +#ifndef __be32
> > +#define __be32	uint32_t
> > +#endif
> > +#ifndef __be64
> > +#define __be64	uint64_t
> > +#endif
> > +#ifndef __u8
> > +#define __u8	uint8_t
> > +#endif
> > +#ifndef __u16
> > +#define __u16	uint16_t
> > +#endif
> > +#ifndef __u32
> > +#define __u32	uint32_t
> > +#endif
> > +#ifndef __u64
> > +#define __u64	uint64_t
> > +#endif
> > +
> > +#define FALSE	0
> > +#define TRUE	1
> > +#define false	0
> > +#define true	1
> > +
> > +#define min(a, b) RTE_MIN(a, b)
> > +#define max(a, b) RTE_MAX(a, b)
> 
> This is not Linux kernel.
> Please don't create wrappers for all the stuff in Linux to port your driver.

We actually referred to several PMDs' compat files, including enic_compat.h,
i40e_osdep.h, ixgbe_osdep.h, fm10k_osdep.h, etc.

Most of the types above are already defined by many of the existing PMDs'
compat files.  Can we at least keep those that are already defined by several
PMDs' compat files?


Thanks,
Rahul.
  
Rahul Lakkireddy May 26, 2015, 5:02 p.m. UTC | #7
On Sat, May 23, 2015 at 11:27:56 +0530, Rahul Lakkireddy wrote:
> On Fri, May 22, 2015 at 09:42:50 -0700, Stephen Hemminger wrote:
> > On Fri, 22 May 2015 18:54:20 +0530
> > Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:
> > 
[...]
> > 
> > This is not Linux kernel.
> > Please don't create wrappers for all the stuff in Linux to port your driver.
> 
> We actually referred to several PMDs' compat files, including enic_compat.h,
> i40e_osdep.h, ixgbe_osdep.h, fm10k_osdep.h, etc.
> 
> Most of the types above are already defined by many of the existing PMDs'
> compat files.  Can we at least keep those that are already defined by several
> PMDs' compat files?

Just to give a background - since we are new to dpdk community, we studied the
already merged PMD's compat files as reference to understand how things are
done for driver submission. And so, we wrote cxgbe compat file along similar
lines. However, if above wrappers are not acceptable then, we will definitely
remove them in V2.

Just trying to get a clarification so that we don't repeat the same mistake in
V2 submission. Reviews from you and dpdk community are more than welcome and
appreciated.


Thanks,
Rahul.
  
Stephen Hemminger May 26, 2015, 5:24 p.m. UTC | #8
On Tue, 26 May 2015 22:32:07 +0530
Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:

> On Sat, May 23, 2015 at 11:27:56 +0530, Rahul Lakkireddy wrote:
> > On Fri, May 22, 2015 at 09:42:50 -0700, Stephen Hemminger wrote:
> > > On Fri, 22 May 2015 18:54:20 +0530
> > > Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:
> > > 
[...]
> > > 
> > > This is not Linux kernel.
> > > Please don't create wrappers for all the stuff in Linux to port your driver.
> > 
> > We actually referred to several PMDs' compat files, including enic_compat.h,
> > i40e_osdep.h, ixgbe_osdep.h, fm10k_osdep.h, etc.
> > 
> > Most of the types above are already defined by many of the existing PMDs'
> > compat files.  Can we at least keep those that are already defined by several
> > PMDs' compat files?
> 
> Just to give a background - since we are new to dpdk community, we studied the
> already merged PMD's compat files as reference to understand how things are
> done for driver submission. And so, we wrote cxgbe compat file along similar
> lines. However, if above wrappers are not acceptable then, we will definitely
> remove them in V2.
> 
> Just trying to get a clarification so that we don't repeat the same mistake in
> V2 submission. Reviews from you and dpdk community are more than welcome and
> appreciated.

Does this driver share source code with other platforms? If it does then the
compatibility wrappers make sense and reduce the maintenance effort.
If the driver is a standalone port to DPDK, then it makes sense to complete
the effort and use standard DPDK coding practices (stdint, stdbool, etc).

The other drivers in DPDK do things based on that. Many of the hardware
drivers share code with BSD. Others like the virtual drivers were written
or ported completely from scratch.
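
As a sketch of that second option (illustrative only; clamp_ring_size() is a
hypothetical helper, not part of the patch), much of the compat block above
collapses into standard headers and existing DPDK macros, with
CXGBE_MIN/MAX_RING_DESC_SIZE coming from the cxgbe.h in this series:

#include <stdbool.h>    /* bool, true, false instead of "typedef int bool" */
#include <stdint.h>     /* uint8_t..uint64_t instead of u8..u64 */
#include <rte_common.h> /* RTE_MIN, RTE_MAX, RTE_DIM */

/* Hypothetical helper: clamp a requested ring size to the supported range */
static inline uint32_t clamp_ring_size(uint32_t requested)
{
	bool too_big = requested > CXGBE_MAX_RING_DESC_SIZE; /* stdbool, no TRUE/FALSE */

	return too_big ? CXGBE_MAX_RING_DESC_SIZE
		       : RTE_MAX(requested, CXGBE_MIN_RING_DESC_SIZE);
}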
  
Rahul Lakkireddy May 26, 2015, 6:13 p.m. UTC | #9
On Tue, May 26, 2015 at 10:24:37 -0700, Stephen Hemminger wrote:
> On Tue, 26 May 2015 22:32:07 +0530
> Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:
> 
> > On Sat, May 23, 2015 at 11:27:56 +0530, Rahul Lakkireddy wrote:
> > > On Fri, May 22, 2015 at 09:42:50 -0700, Stephen Hemminger wrote:
> > > > On Fri, 22 May 2015 18:54:20 +0530
> > > > Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:
> > > > 
[...]

> > > > 
> > > > This is not Linux kernel.
> > > > Please don't create wrappers for all the stuff in Linux to port your driver.
> > > 
> > > We actually referred to several PMDs' compat files, including enic_compat.h,
> > > i40e_osdep.h, ixgbe_osdep.h, fm10k_osdep.h, etc.
> > > 
> > > Most of the types above are already defined by many of the existing PMDs'
> > > compat files.  Can we at least keep those that are already defined by several
> > > PMDs' compat files?
> > 
> > Just to give a background - since we are new to dpdk community, we studied the
> > already merged PMD's compat files as reference to understand how things are
> > done for driver submission. And so, we wrote cxgbe compat file along similar
> > lines. However, if above wrappers are not acceptable then, we will definitely
> > remove them in V2.
> > 
> > Just trying to get a clarification so that we don't repeat the same mistake in
> > V2 submission. Reviews from you and dpdk community are more than welcome and
> > appreciated.
> 
> Does this driver share source code with other platforms?

Yes. The h/w specific code is common to Linux and FBSD.
And we will be enabling FBSD support soon after this PMD gets merged and we are
able to carry out and complete testing on FBSD.


> If it does then the compatibility wrappers make sense and reduce the
> maintenance effort.
> If the driver is a standalone port to DPDK, then it makes sense to complete
> the effort and use standard DPDK coding practices (stdint, stdbool, etc).
> 
> The other drivers in DPDK do things based on that. Many of the hardware
> drivers share code with BSD. Others like the virtual drivers were written
> or ported completely from scratch.
>

Thank you for your guidance.
  
Thomas Monjalon May 27, 2015, 5:49 a.m. UTC | #10
2015-05-23 11:23, Rahul Lakkireddy:
> We need to rebase anyway since PMDs now seem to have moved to
> drivers/net directory.

Yes. And the subdirectory should probably be renamed base/.

It would be nice to introduce the PMD features in separate patches
as it was done for fm10k.

Thanks
  
Rahul Lakkireddy May 27, 2015, 11:26 a.m. UTC | #11
On Tue, May 26, 2015 at 22:49:08 -0700, Thomas Monjalon wrote:
> 2015-05-23 11:23, Rahul Lakkireddy:
> > We need to rebase anyway since PMDs now seem to have moved to
> > drivers/net directory.
> 
> Yes. And the subdirectory should probably be renamed base/.

Yes, we will do this.

> 
> It would be nice to introduce the PMD features in separate patches
> as it was done for fm10k.
> 
> Thanks

Ok. We will break this patch into several separate patches for v2 submission.


Thanks,
Rahul.
  

Patch

diff --git a/lib/librte_pmd_cxgbe/Makefile b/lib/librte_pmd_cxgbe/Makefile
new file mode 100644
index 0000000..945f196
--- /dev/null
+++ b/lib/librte_pmd_cxgbe/Makefile
@@ -0,0 +1,74 @@ 
+#   BSD LICENSE
+#
+#   Copyright(c) 2014-2015 Chelsio Communications.
+#   All rights reserved.
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the name of Chelsio Communications nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_cxgbe.a
+
+CFLAGS += -I$(RTE_SDK)/lib/librte_pmd_cxgbe/cxgbe/
+CFLAGS += -I$(RTE_SDK)/lib/librte_pmd_cxgbe/
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+
+ifeq ($(CC), icc)
+#
+# CFLAGS for icc
+#
+CFLAGS_BASE_DRIVER = -wd174 -wd593 -wd869 -wd981 -wd2259
+else
+#
+# CFLAGS for gcc
+#
+ifeq ($(shell test $(GCC_VERSION) -ge 44 && echo 1), 1)
+CFLAGS     += -Wno-deprecated
+endif
+CFLAGS_BASE_DRIVER = -Wno-unused-parameter -Wno-unused-value
+CFLAGS_BASE_DRIVER += -Wno-strict-aliasing -Wno-format-extra-args
+
+endif
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_CXGBE_PMD) += cxgbe_ethdev.c
+SRCS-$(CONFIG_RTE_LIBRTE_CXGBE_PMD) += cxgbe_main.c
+SRCS-$(CONFIG_RTE_LIBRTE_CXGBE_PMD) += sge.c
+SRCS-$(CONFIG_RTE_LIBRTE_CXGBE_PMD) += cxgbe/t4_hw.c
+
+# this lib depends upon:
+DEPDIRS-$(CONFIG_RTE_LIBRTE_CXGBE_PMD) += lib/librte_eal lib/librte_ether
+DEPDIRS-$(CONFIG_RTE_LIBRTE_CXGBE_PMD) += lib/librte_mempool lib/librte_mbuf
+DEPDIRS-$(CONFIG_RTE_LIBRTE_CXGBE_PMD) += lib/librte_net lib/librte_malloc
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_pmd_cxgbe/cxgbe.h b/lib/librte_pmd_cxgbe/cxgbe.h
new file mode 100644
index 0000000..97c37d2
--- /dev/null
+++ b/lib/librte_pmd_cxgbe/cxgbe.h
@@ -0,0 +1,60 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2014-2015 Chelsio Communications.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Chelsio Communications nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _CXGBE_H_
+#define _CXGBE_H_
+
+#include "common.h"
+#include "t4_regs.h"
+
+#define CXGBE_MIN_RING_DESC_SIZE      1024 /* Min TX/RX descriptor ring size */
+#define CXGBE_MAX_RING_DESC_SIZE      4096 /* Max TX/RX descriptor ring size */
+
+#define CXGBE_DEFAULT_TX_DESC_SIZE    1024 /* Default TX ring size */
+#define CXGBE_DEFAULT_RX_DESC_SIZE    1024 /* Default RX ring size */
+
+int cxgbe_probe(struct adapter *adapter);
+int cxgbe_up(struct adapter *adap);
+int cxgbe_down(struct port_info *pi);
+void cxgbe_close(struct adapter *adapter);
+void cxgbe_stats_get(struct port_info *pi, struct port_stats *stats);
+void cxgbe_stats_reset(struct port_info *pi);
+int link_start(struct port_info *pi);
+void init_rspq(struct adapter *adap, struct sge_rspq *q, unsigned int us,
+	       unsigned int cnt, unsigned int size, unsigned int iqe_size);
+int setup_sge_fwevtq(struct adapter *adapter);
+void cfg_queues(struct rte_eth_dev *eth_dev);
+int cfg_queue_count(struct rte_eth_dev *eth_dev);
+int setup_rss(struct port_info *pi);
+
+#endif /* _CXGBE_H_ */
diff --git a/lib/librte_pmd_cxgbe/cxgbe_compat.h b/lib/librte_pmd_cxgbe/cxgbe_compat.h
new file mode 100644
index 0000000..1526659
--- /dev/null
+++ b/lib/librte_pmd_cxgbe/cxgbe_compat.h
@@ -0,0 +1,290 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2014-2015 Chelsio Communications.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Chelsio Communications nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _CXGBE_COMPAT_H_
+#define _CXGBE_COMPAT_H_
+
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdarg.h>
+
+#include <rte_common.h>
+#include <rte_memcpy.h>
+#include <rte_byteorder.h>
+#include <rte_cycles.h>
+#include <rte_spinlock.h>
+#include <rte_log.h>
+
+#define dev_printf(level, fmt, args...) \
+	RTE_LOG(level, PMD, "rte_cxgbe_pmd: " fmt, ## args)
+
+#define dev_err(x, args...) dev_printf(ERR, args)
+#define dev_info(x, args...) dev_printf(INFO, args)
+#define dev_warn(x, args...) dev_printf(WARNING, args)
+
+#ifdef RTE_LIBRTE_CXGBE_DEBUG
+#define dev_debug(x, args...) dev_printf(DEBUG, args)
+#else
+#define dev_debug(x, args...) do { } while (0)
+#endif
+
+#ifdef RTE_LIBRTE_CXGBE_DEBUG_REG
+#define CXGBE_DEBUG_REG(x, args...) dev_printf(DEBUG, "REG:" args)
+#else
+#define CXGBE_DEBUG_REG(x, args...) do { } while (0)
+#endif
+
+#ifdef RTE_LIBRTE_CXGBE_DEBUG_MBOX
+#define CXGBE_DEBUG_MBOX(x, args...) dev_printf(DEBUG, "MBOX:" args)
+#else
+#define CXGBE_DEBUG_MBOX(x, args...) do { } while (0)
+#endif
+
+#ifdef RTE_LIBRTE_CXGBE_DEBUG_TX
+#define CXGBE_DEBUG_TX(x, args...) dev_printf(DEBUG, "TX:" args)
+#else
+#define CXGBE_DEBUG_TX(x, args...) do { } while (0)
+#endif
+
+#ifdef RTE_LIBRTE_CXGBE_DEBUG_RX
+#define CXGBE_DEBUG_RX(x, args...) dev_printf(DEBUG, "RX:" args)
+#else
+#define CXGBE_DEBUG_RX(x, args...) do { } while (0)
+#endif
+
+#ifdef RTE_LIBRTE_CXGBE_DEBUG
+#define CXGBE_FUNC_TRACE() \
+	RTE_LOG(DEBUG, PMD, "CXGBE trace: %s\n", __func__)
+#else
+#define CXGBE_FUNC_TRACE() do { } while (0)
+#endif
+
+#define pr_err(y, args...) dev_err(0, y, ##args)
+#define pr_warn(y, args...) dev_warn(0, y, ##args)
+#define pr_info(y, args...) dev_info(0, y, ##args)
+#define BUG() pr_err("BUG at %s:%d", __func__, __LINE__)
+
+#define ASSERT(x) do {\
+	if (!(x)) \
+		rte_panic("CXGBE: x"); \
+} while (0)
+#define BUG_ON(x) ASSERT(!(x))
+
+#ifndef WARN_ON
+#define WARN_ON(x) do { \
+	int ret = !!(x); \
+	if (unlikely(ret)) \
+		pr_warn("WARN_ON: \"" #x "\" at %s:%d\n", __func__, __LINE__); \
+} while (0)
+#endif
+
+#define __iomem
+
+#ifndef BIT
+#define BIT(n) (1 << (n))
+#endif
+
+#define L1_CACHE_SHIFT  6
+#define L1_CACHE_BYTES  BIT(L1_CACHE_SHIFT)
+
+#define PAGE_SHIFT  12
+#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))
+#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
+
+#define VLAN_HLEN 4
+
+#define rmb()     rte_rmb() /* dpdk rte provided rmb */
+#define wmb()     rte_wmb() /* dpdk rte provided wmb */
+
+typedef uint8_t   u8;
+typedef int8_t    s8;
+typedef uint16_t  u16;
+typedef uint32_t  u32;
+typedef int32_t   s32;
+typedef uint64_t  u64;
+typedef int       bool;
+typedef uint64_t  dma_addr_t;
+
+#ifndef __le16
+#define __le16	uint16_t
+#endif
+#ifndef __le32
+#define __le32	uint32_t
+#endif
+#ifndef __le64
+#define __le64	uint64_t
+#endif
+#ifndef __be16
+#define __be16	uint16_t
+#endif
+#ifndef __be32
+#define __be32	uint32_t
+#endif
+#ifndef __be64
+#define __be64	uint64_t
+#endif
+#ifndef __u8
+#define __u8	uint8_t
+#endif
+#ifndef __u16
+#define __u16	uint16_t
+#endif
+#ifndef __u32
+#define __u32	uint32_t
+#endif
+#ifndef __u64
+#define __u64	uint64_t
+#endif
+
+#define FALSE	0
+#define TRUE	1
+#define false	0
+#define true	1
+
+#define min(a, b) RTE_MIN(a, b)
+#define max(a, b) RTE_MAX(a, b)
+
+/*
+ * round up val _p to a power of 2 size _s
+ */
+#define roundup(_p, _s) (((unsigned long)(_p) + (_s - 1)) & ~(_s - 1))
+
+#undef container_of
+#define container_of(ptr, type, member) ({ \
+		typeof(((type *)0)->member)(*__mptr) = (ptr); \
+		(type *)((char *)__mptr - offsetof(type, member)); })
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+
+#define cpu_to_be16(o) rte_cpu_to_be_16(o)
+#define cpu_to_be32(o) rte_cpu_to_be_32(o)
+#define cpu_to_be64(o) rte_cpu_to_be_64(o)
+#define cpu_to_le32(o) rte_cpu_to_le_32(o)
+#define be16_to_cpu(o) rte_be_to_cpu_16(o)
+#define be32_to_cpu(o) rte_be_to_cpu_32(o)
+#define be64_to_cpu(o) rte_be_to_cpu_64(o)
+#define le32_to_cpu(o) rte_le_to_cpu_32(o)
+
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#define DELAY(x) rte_delay_us(x)
+#define udelay(x) DELAY(x)
+#define msleep(x) DELAY(1000 * (x))
+#define usleep_range(min, max) msleep(DIV_ROUND_UP(min, 1000))
+
+static inline uint8_t hweight32(uint32_t word32)
+{
+	uint32_t res = word32 - ((word32 >> 1) & 0x55555555);
+
+	res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+	res = (res + (res >> 4)) & 0x0F0F0F0F;
+	res = res + (res >> 8);
+	return (res + (res >> 16)) & 0x000000FF;
+
+} /* weight32 */
+
+/**
+ * fls - find last (most-significant) bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as ffs.
+ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
+ */
+static inline int fls(int x)
+{
+	int r = 32;
+
+	if (!x)
+		return 0;
+	if (!(x & 0xffff0000u)) {
+		x <<= 16;
+		r -= 16;
+	}
+	if (!(x & 0xff000000u)) {
+		x <<= 8;
+		r -= 8;
+	}
+	if (!(x & 0xf0000000u)) {
+		x <<= 4;
+		r -= 4;
+	}
+	if (!(x & 0xc0000000u)) {
+		x <<= 2;
+		r -= 2;
+	}
+	if (!(x & 0x80000000u)) {
+		x <<= 1;
+		r -= 1;
+	}
+	return r;
+}
+
+static inline unsigned long ilog2(unsigned long n)
+{
+	unsigned int e = 0;
+
+	while (n) {
+		if (n & ~((1 << 8) - 1)) {
+			e += 8;
+			n >>= 8;
+			continue;
+		}
+
+		if (n & ~((1 << 4) - 1)) {
+			e += 4;
+			n >>= 4;
+		}
+
+		for (;;) {
+			n >>= 1;
+			if (n == 0)
+				break;
+			e++;
+		}
+	}
+
+	return e;
+}
+
+static inline void writel(unsigned int val, volatile void __iomem *addr)
+{
+	*(volatile unsigned int *)addr = val;
+}
+
+static inline void writeq(u64 val, volatile void __iomem *addr)
+{
+	writel(val, addr);
+	writel(val >> 32, (void *)((uintptr_t)addr + 4));
+}
+
+#endif /* _CXGBE_COMPAT_H_ */
diff --git a/lib/librte_pmd_cxgbe/cxgbe_ethdev.c b/lib/librte_pmd_cxgbe/cxgbe_ethdev.c
new file mode 100644
index 0000000..aa2c50e
--- /dev/null
+++ b/lib/librte_pmd_cxgbe/cxgbe_ethdev.c
@@ -0,0 +1,796 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2014-2015 Chelsio Communications.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Chelsio Communications nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <inttypes.h>
+#include <netinet/in.h>
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_interrupts.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_pci.h>
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_alarm.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_atomic.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_dev.h>
+
+#include "cxgbe.h"
+
+/*
+ * Macros needed to support the PCI Device ID Table ...
+ */
+#define CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN \
+	static struct rte_pci_id cxgb4_pci_tbl[] = {
+#define CH_PCI_DEVICE_ID_FUNCTION 0x4
+
+#define PCI_VENDOR_ID_CHELSIO 0x1425
+
+#define CH_PCI_ID_TABLE_ENTRY(devid) \
+		{ RTE_PCI_DEVICE(PCI_VENDOR_ID_CHELSIO, (devid)) }
+
+#define CH_PCI_DEVICE_ID_TABLE_DEFINE_END \
+		{ .vendor_id = 0, } \
+	}
+
+/*
+ *... and the PCI ID Table itself ...
+ */
+#include "t4_pci_id_tbl.h"
+
+static uint16_t cxgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
+				uint16_t nb_pkts)
+{
+	struct sge_eth_txq *txq = (struct sge_eth_txq *)tx_queue;
+	uint16_t pkts_sent, pkts_remain;
+	uint16_t total_sent = 0;
+	int ret = 0;
+
+	CXGBE_DEBUG_TX(adapter, "%s: txq = %p; tx_pkts = %p; nb_pkts = %d\n",
+		       __func__, txq, tx_pkts, nb_pkts);
+
+	/* free up desc from already completed tx */
+	reclaim_completed_tx(&txq->q);
+	while (total_sent < nb_pkts) {
+		pkts_remain = nb_pkts - total_sent;
+
+		for (pkts_sent = 0; pkts_sent < pkts_remain; pkts_sent++) {
+			ret = t4_eth_xmit(txq, tx_pkts[total_sent + pkts_sent],
+					  pkts_remain - pkts_sent - 1);
+			if (ret < 0)
+				break;
+		}
+		if (!pkts_sent)
+			break;
+		total_sent += pkts_sent;
+		/* reclaim as much as possible */
+		reclaim_completed_tx(&txq->q);
+	}
+	return total_sent;
+}
+
+static uint16_t cxgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
+				uint16_t nb_pkts)
+{
+	struct sge_eth_rxq *rxq = (struct sge_eth_rxq *)rx_queue;
+	unsigned int work_done;
+
+	CXGBE_DEBUG_RX(adapter, "%s: rxq->rspq.cntxt_id = %u; nb_pkts = %d\n",
+		       __func__, rxq->rspq.cntxt_id, nb_pkts);
+
+	if (cxgbe_poll(&rxq->rspq, rx_pkts, (unsigned int)nb_pkts, &work_done))
+		dev_err(adapter, "error in cxgbe poll\n");
+
+	CXGBE_DEBUG_RX(adapter, "%s: work_done = %u\n", __func__, work_done);
+	return work_done;
+}
+
+static void cxgbe_dev_info_get(struct rte_eth_dev *eth_dev,
+			       struct rte_eth_dev_info *device_info)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	int max_queues = adapter->sge.max_ethqsets / adapter->params.nports;
+
+	device_info->min_rx_bufsize = 68; /* XXX: Smallest pkt size */
+	device_info->max_rx_pktlen = 1500; /* XXX: For now we support mtu */
+	device_info->max_rx_queues = max_queues;
+	device_info->max_tx_queues = max_queues;
+	device_info->max_mac_addrs = 1;
+	/* XXX: For now we support one MAC/port */
+	device_info->max_vfs = adapter->params.arch.vfcount;
+	device_info->max_vmdq_pools = 0; /* XXX: For now no support for VMDQ */
+
+	device_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP |
+				       DEV_RX_OFFLOAD_IPV4_CKSUM |
+				       DEV_RX_OFFLOAD_UDP_CKSUM |
+				       DEV_RX_OFFLOAD_TCP_CKSUM;
+
+	device_info->tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT |
+				       DEV_TX_OFFLOAD_IPV4_CKSUM |
+				       DEV_TX_OFFLOAD_UDP_CKSUM |
+				       DEV_TX_OFFLOAD_TCP_CKSUM |
+				       DEV_TX_OFFLOAD_TCP_TSO;
+
+	device_info->reta_size = pi->rss_size;
+}
+
+static void cxgbe_dev_promiscuous_enable(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+
+	t4_set_rxmode(adapter, adapter->mbox, pi->viid, -1,
+		      1, -1, 1, -1, false);
+}
+
+static void cxgbe_dev_promiscuous_disable(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+
+	t4_set_rxmode(adapter, adapter->mbox, pi->viid, -1,
+		      0, -1, 1, -1, false);
+}
+
+static void cxgbe_dev_allmulticast_enable(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+
+	/* TODO: address filters ?? */
+
+	t4_set_rxmode(adapter, adapter->mbox, pi->viid, -1,
+		      -1, 1, 1, -1, false);
+}
+
+static void cxgbe_dev_allmulticast_disable(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+
+	/* TODO: address filters ?? */
+
+	t4_set_rxmode(adapter, adapter->mbox, pi->viid, -1,
+		      -1, 0, 1, -1, false);
+}
+
+static int cxgbe_dev_link_update(struct rte_eth_dev *eth_dev,
+				 __rte_unused int wait_to_complete)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	struct sge *s = &adapter->sge;
+	unsigned int work_done, budget = 4;
+	int ret;
+
+	cxgbe_poll(&s->fw_evtq, NULL, budget, &work_done);
+	ret = pi->link_cfg.link_ok;
+	eth_dev->data->dev_link.link_status = pi->link_cfg.link_ok;
+	eth_dev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
+	eth_dev->data->dev_link.link_speed = pi->link_cfg.speed;
+	return ret;
+}
+
+static int cxgbe_dev_tx_queue_start(struct rte_eth_dev *eth_dev,
+				    uint16_t tx_queue_id);
+static int cxgbe_dev_rx_queue_start(struct rte_eth_dev *eth_dev,
+				    uint16_t tx_queue_id);
+static void cxgbe_dev_tx_queue_release(void *q);
+static void cxgbe_dev_rx_queue_release(void *q);
+
+/*
+ * Stop device.
+ */
+static void cxgbe_dev_close(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	int i, dev_down = 0;
+
+	CXGBE_FUNC_TRACE();
+
+	if (!(adapter->flags & FULL_INIT_DONE))
+		return;
+
+	cxgbe_down(pi);
+
+	/*
+	 *  We clear queues only if both tx and rx path of the port
+	 *  have been disabled
+	 */
+	t4_sge_eth_clear_queues(pi);
+
+	/*  See if all ports are down */
+	for_each_port(adapter, i) {
+		pi = adap2pinfo(adapter, i);
+		/*
+		 * Skip first port of the adapter since it will be closed
+		 * by DPDK
+		 */
+		if (i == 0)
+			continue;
+		dev_down += (pi->eth_dev->data->dev_started == 0) ? 1 : 0;
+	}
+
+	/* If rest of the ports are stopped, then free up resources */
+	if (dev_down == (adapter->params.nports - 1))
+		cxgbe_close(adapter);
+}
+
+/* Start the device.
+ * It returns 0 on success.
+ */
+static int cxgbe_dev_start(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	int err = 0, i;
+
+	CXGBE_FUNC_TRACE();
+
+	/*
+	 * If we don't have a connection to the firmware there's nothing we
+	 * can do.
+	 */
+	if (!(adapter->flags & FW_OK)) {
+		err = -ENXIO;
+		goto out;
+	}
+
+	if (!(adapter->flags & FULL_INIT_DONE)) {
+		err = cxgbe_up(adapter);
+		if (err < 0)
+			goto out;
+	}
+
+	err = setup_rss(pi);
+	if (err)
+		goto out;
+
+	for (i = 0; i < pi->n_tx_qsets; i++) {
+		err = cxgbe_dev_tx_queue_start(eth_dev, i);
+		if (err)
+			goto out;
+	}
+
+	for (i = 0; i < pi->n_rx_qsets; i++) {
+		err = cxgbe_dev_rx_queue_start(eth_dev, i);
+		if (err)
+			goto out;
+	}
+
+	err = link_start(pi);
+	if (err)
+		goto out;
+
+out:
+	return err;
+}
+
+/*
+ * Stop device: disable rx and tx functions to allow for reconfiguring.
+ */
+static void cxgbe_dev_stop(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+
+	CXGBE_FUNC_TRACE();
+
+	if (!(adapter->flags & FULL_INIT_DONE))
+		return;
+
+	cxgbe_down(pi);
+
+	/*
+	 *  We clear queues only if both tx and rx path of the port
+	 *  have been disabled
+	 */
+	t4_sge_eth_clear_queues(pi);
+}
+
+static int cxgbe_dev_configure(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	int err;
+
+	CXGBE_FUNC_TRACE();
+
+	if (!(adapter->flags & FW_QUEUE_BOUND)) {
+		err = setup_sge_fwevtq(adapter);
+		if (err)
+			return err;
+		adapter->flags |= FW_QUEUE_BOUND;
+	}
+
+	err = cfg_queue_count(eth_dev);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int cxgbe_dev_tx_queue_start(struct rte_eth_dev *eth_dev,
+				    uint16_t tx_queue_id)
+{
+	struct sge_eth_txq *txq = (struct sge_eth_txq *)
+				  (eth_dev->data->tx_queues[tx_queue_id]);
+
+	dev_debug(NULL, "%s: tx_queue_id = %d\n", __func__, tx_queue_id);
+
+	return t4_sge_eth_txq_start(txq);
+}
+
+static int cxgbe_dev_tx_queue_stop(struct rte_eth_dev *eth_dev,
+				   uint16_t tx_queue_id)
+{
+	struct sge_eth_txq *txq = (struct sge_eth_txq *)
+				  (eth_dev->data->tx_queues[tx_queue_id]);
+
+	dev_debug(NULL, "%s: tx_queue_id = %d\n", __func__, tx_queue_id);
+
+	return t4_sge_eth_txq_stop(txq);
+}
+
+static int cxgbe_dev_tx_queue_setup(struct rte_eth_dev *eth_dev,
+				    uint16_t queue_idx,	uint16_t nb_desc,
+				    unsigned int socket_id,
+				    const struct rte_eth_txconf *tx_conf)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	struct sge *s = &adapter->sge;
+	struct sge_eth_txq *txq = &s->ethtxq[pi->first_qset + queue_idx];
+	int err = 0;
+	unsigned int temp_nb_desc;
+
+	RTE_SET_USED(tx_conf);
+
+	dev_debug(adapter, "%s: eth_dev->data->nb_tx_queues = %d; queue_idx = %d; nb_desc = %d; socket_id = %d; pi->first_qset = %u\n",
+		  __func__, eth_dev->data->nb_tx_queues, queue_idx, nb_desc,
+		  socket_id, pi->first_qset);
+
+	/*  Free up the existing queue  */
+	if (eth_dev->data->tx_queues[queue_idx]) {
+		cxgbe_dev_tx_queue_release(eth_dev->data->tx_queues[queue_idx]);
+		eth_dev->data->tx_queues[queue_idx] = NULL;
+	}
+
+	eth_dev->data->tx_queues[queue_idx] = (void *)txq;
+
+	/* Sanity Checking
+	 *
+	 * nb_desc should be > 1023 and <= CXGBE_MAX_RING_DESC_SIZE
+	 */
+	temp_nb_desc = nb_desc;
+	if (nb_desc < CXGBE_MIN_RING_DESC_SIZE) {
+		dev_warn(adapter, "%s: number of descriptors must be >= %d. Using default [%d]\n",
+			 __func__, CXGBE_MIN_RING_DESC_SIZE,
+			 CXGBE_DEFAULT_TX_DESC_SIZE);
+		temp_nb_desc = CXGBE_DEFAULT_TX_DESC_SIZE;
+	} else if (nb_desc > CXGBE_MAX_RING_DESC_SIZE) {
+		dev_err(adapter, "%s: number of descriptors must be between %d and %d inclusive. Default [%d]\n",
+			__func__, CXGBE_MIN_RING_DESC_SIZE,
+			CXGBE_MAX_RING_DESC_SIZE, CXGBE_DEFAULT_TX_DESC_SIZE);
+		return -(EINVAL);
+	}
+
+	txq->q.size = temp_nb_desc;
+
+	err = t4_sge_alloc_eth_txq(adapter, txq, eth_dev, queue_idx,
+				   s->fw_evtq.cntxt_id, socket_id);
+
+	dev_debug(adapter, "%s: txq->q.cntxt_id= %d err = %d\n",
+		  __func__, txq->q.cntxt_id, err);
+
+	return err;
+}
+
+static void cxgbe_dev_tx_queue_release(void *q)
+{
+	struct sge_eth_txq *txq = (struct sge_eth_txq *)q;
+
+	if (txq) {
+		struct port_info *pi = (struct port_info *)
+				       (txq->eth_dev->data->dev_private);
+		struct adapter *adap = pi->adapter;
+
+		dev_debug(adapter, "%s: pi->port_id = %d; tx_queue_id = %d\n",
+			  __func__, pi->port_id, txq->q.cntxt_id);
+
+		t4_sge_eth_txq_release(adap, txq);
+	}
+}
+
+static int cxgbe_dev_rx_queue_start(struct rte_eth_dev *eth_dev,
+				    uint16_t rx_queue_id)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adap = pi->adapter;
+	struct sge_rspq *q;
+
+	dev_debug(adapter, "%s: pi->port_id = %d; rx_queue_id = %d\n",
+		  __func__, pi->port_id, rx_queue_id);
+
+	q = eth_dev->data->rx_queues[rx_queue_id];
+	return t4_sge_eth_rxq_start(adap, q);
+}
+
+static int cxgbe_dev_rx_queue_stop(struct rte_eth_dev *eth_dev,
+				   uint16_t rx_queue_id)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adap = pi->adapter;
+	struct sge_rspq *q;
+
+	dev_debug(adapter, "%s: pi->port_id = %d; rx_queue_id = %d\n",
+		  __func__, pi->port_id, rx_queue_id);
+
+	q = eth_dev->data->rx_queues[rx_queue_id];
+	return t4_sge_eth_rxq_stop(adap, q);
+}
+
+static int cxgbe_dev_rx_queue_setup(struct rte_eth_dev *eth_dev,
+				    uint16_t queue_idx,	uint16_t nb_desc,
+				    unsigned int socket_id,
+				    const struct rte_eth_rxconf *rx_conf,
+				    struct rte_mempool *mp)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	struct sge *s = &adapter->sge;
+	struct sge_eth_rxq *rxq = &s->ethrxq[pi->first_qset + queue_idx];
+	int err = 0;
+	int msi_idx = 0;
+	unsigned int temp_nb_desc;
+
+	RTE_SET_USED(rx_conf);
+
+	dev_debug(adapter, "%s: eth_dev->data->nb_rx_queues = %d; queue_idx = %d; nb_desc = %d; socket_id = %d; mp = %p\n",
+		  __func__, eth_dev->data->nb_rx_queues, queue_idx, nb_desc,
+		  socket_id, mp);
+
+	/*  Free up the existing queue  */
+	if (eth_dev->data->rx_queues[queue_idx]) {
+		cxgbe_dev_rx_queue_release(eth_dev->data->rx_queues[queue_idx]);
+		eth_dev->data->rx_queues[queue_idx] = NULL;
+	}
+
+	eth_dev->data->rx_queues[queue_idx] = (void *)rxq;
+
+	/* Sanity Checking
+	 *
+	 * nb_desc should be > 0 and <= CXGBE_MAX_RING_DESC_SIZE
+	 */
+	temp_nb_desc = nb_desc;
+	if (nb_desc < CXGBE_MIN_RING_DESC_SIZE) {
+		dev_warn(adapter, "%s: number of descriptors must be >= %d. Using default [%d]\n",
+			 __func__, CXGBE_MIN_RING_DESC_SIZE,
+			 CXGBE_DEFAULT_RX_DESC_SIZE);
+		temp_nb_desc = CXGBE_DEFAULT_RX_DESC_SIZE;
+	} else if (nb_desc > CXGBE_MAX_RING_DESC_SIZE) {
+		dev_err(adapter, "%s: number of descriptors must be between %d and %d inclusive. Default [%d]\n",
+			__func__, CXGBE_MIN_RING_DESC_SIZE,
+			CXGBE_MAX_RING_DESC_SIZE, CXGBE_DEFAULT_RX_DESC_SIZE);
+		return -(EINVAL);
+	}
+
+	rxq->rspq.size = temp_nb_desc;
+	if ((&rxq->fl) != NULL)
+		rxq->fl.size = temp_nb_desc;
+
+	err = t4_sge_alloc_rxq(adapter, &rxq->rspq, false, eth_dev, msi_idx,
+			       &rxq->fl, t4_ethrx_handler,
+			       t4_get_mps_bg_map(adapter, pi->tx_chan), mp,
+			       queue_idx, socket_id);
+
+	dev_debug(adapter, "%s: err = %d; port_id = %d; cntxt_id = %u\n",
+		  __func__, err, pi->port_id, rxq->rspq.cntxt_id);
+	return err;
+}
+
+static void cxgbe_dev_rx_queue_release(void *q)
+{
+	struct sge_eth_rxq *rxq = (struct sge_eth_rxq *)q;
+	struct sge_rspq *rq = &rxq->rspq;
+
+	if (rq) {
+		struct port_info *pi = (struct port_info *)
+				       (rq->eth_dev->data->dev_private);
+		struct adapter *adap = pi->adapter;
+
+		dev_debug(adapter, "%s: pi->port_id = %d; rx_queue_id = %d\n",
+			  __func__, pi->port_id, rxq->rspq.cntxt_id);
+
+		t4_sge_eth_rxq_release(adap, rxq);
+	}
+}
+
+/*
+ * Get port statistics.
+ */
+static void cxgbe_dev_stats_get(struct rte_eth_dev *eth_dev,
+				struct rte_eth_stats *eth_stats)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	struct sge *s = &adapter->sge;
+	struct port_stats ps;
+	unsigned int i;
+
+	cxgbe_stats_get(pi, &ps);
+
+	/* RX Stats */
+	eth_stats->ipackets = ps.rx_frames;
+	eth_stats->ibytes   = ps.rx_octets;
+	eth_stats->imcasts  = ps.rx_mcast_frames;
+	eth_stats->imissed  = ps.rx_ovflow0 + ps.rx_ovflow1 +
+			      ps.rx_ovflow2 + ps.rx_ovflow3 +
+			      ps.rx_trunc0 + ps.rx_trunc1 +
+			      ps.rx_trunc2 + ps.rx_trunc3;
+	eth_stats->ibadcrc  = ps.rx_fcs_err;
+	eth_stats->ibadlen  = ps.rx_jabber + ps.rx_too_long + ps.rx_runt;
+	eth_stats->ierrors  = ps.rx_symbol_err + eth_stats->ibadcrc +
+			      eth_stats->ibadlen + ps.rx_len_err +
+			      eth_stats->imissed;
+	eth_stats->rx_pause_xon  = ps.rx_pause;
+
+	/* TX Stats */
+	eth_stats->opackets = ps.tx_frames;
+	eth_stats->obytes   = ps.tx_octets;
+	eth_stats->oerrors  = ps.tx_error_frames;
+	eth_stats->tx_pause_xon  = ps.tx_pause;
+
+	for (i = 0; i < pi->n_rx_qsets; i++) {
+		struct sge_eth_rxq *rxq =
+			&s->ethrxq[pi->first_qset + i];
+
+		eth_stats->q_ipackets[i] = rxq->stats.pkts;
+		eth_stats->q_ibytes[i] = rxq->stats.rx_bytes;
+	}
+
+	for (i = 0; i < pi->n_tx_qsets; i++) {
+		struct sge_eth_txq *txq =
+			&s->ethtxq[pi->first_qset + i];
+
+		eth_stats->q_opackets[i] = txq->stats.pkts;
+		eth_stats->q_obytes[i] = txq->stats.tx_bytes;
+		eth_stats->q_errors[i] = txq->stats.mapping_err;
+	}
+}
+
+/*
+ * Reset port statistics.
+ */
+static void cxgbe_dev_stats_reset(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	struct sge *s = &adapter->sge;
+	unsigned int i;
+
+	cxgbe_stats_reset(pi);
+	for (i = 0; i < pi->n_rx_qsets; i++) {
+		struct sge_eth_rxq *rxq =
+			&s->ethrxq[pi->first_qset + i];
+
+		rxq->stats.pkts = 0;
+		rxq->stats.rx_bytes = 0;
+	}
+	for (i = 0; i < pi->n_tx_qsets; i++) {
+		struct sge_eth_txq *txq =
+			&s->ethtxq[pi->first_qset + i];
+
+		txq->stats.pkts = 0;
+		txq->stats.tx_bytes = 0;
+		txq->stats.mapping_err = 0;
+	}
+}
+
+static int cxgbe_flow_ctrl_get(struct rte_eth_dev *eth_dev,
+			       struct rte_eth_fc_conf *fc_conf)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct link_config *lc = &pi->link_cfg;
+	int rx_pause, tx_pause;
+
+	fc_conf->autoneg = lc->fc & PAUSE_AUTONEG;
+	rx_pause = lc->fc & PAUSE_RX;
+	tx_pause = lc->fc & PAUSE_TX;
+
+	if (rx_pause && tx_pause)
+		fc_conf->mode = RTE_FC_FULL;
+	else if (rx_pause)
+		fc_conf->mode = RTE_FC_RX_PAUSE;
+	else if (tx_pause)
+		fc_conf->mode = RTE_FC_TX_PAUSE;
+	else
+		fc_conf->mode = RTE_FC_NONE;
+	return 0;
+}
+
+static int cxgbe_flow_ctrl_set(struct rte_eth_dev *eth_dev,
+			       struct rte_eth_fc_conf *fc_conf)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = pi->adapter;
+	struct link_config *lc = &pi->link_cfg;
+
+	if (lc->supported & FW_PORT_CAP_ANEG) {
+		if (fc_conf->autoneg)
+			lc->requested_fc |= PAUSE_AUTONEG;
+		else
+			lc->requested_fc &= ~PAUSE_AUTONEG;
+	}
+
+	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
+	    (fc_conf->mode & RTE_FC_RX_PAUSE))
+		lc->requested_fc |= PAUSE_RX;
+	else
+		lc->requested_fc &= ~PAUSE_RX;
+
+	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
+	    (fc_conf->mode & RTE_FC_TX_PAUSE))
+		lc->requested_fc |= PAUSE_TX;
+	else
+		lc->requested_fc &= ~PAUSE_TX;
+
+	return t4_link_l1cfg(adapter, adapter->mbox, pi->tx_chan,
+			     &pi->link_cfg);
+}
+
+static struct eth_dev_ops cxgbe_eth_dev_ops = {
+	.dev_start		= cxgbe_dev_start,
+	.dev_stop		= cxgbe_dev_stop,
+	.dev_close		= cxgbe_dev_close,
+	.promiscuous_enable	= cxgbe_dev_promiscuous_enable,
+	.promiscuous_disable	= cxgbe_dev_promiscuous_disable,
+	.allmulticast_enable	= cxgbe_dev_allmulticast_enable,
+	.allmulticast_disable	= cxgbe_dev_allmulticast_disable,
+	.dev_configure		= cxgbe_dev_configure,
+	.dev_infos_get		= cxgbe_dev_info_get,
+	.link_update		= cxgbe_dev_link_update,
+	.tx_queue_setup         = cxgbe_dev_tx_queue_setup,
+	.tx_queue_start		= cxgbe_dev_tx_queue_start,
+	.tx_queue_stop		= cxgbe_dev_tx_queue_stop,
+	.tx_queue_release	= cxgbe_dev_tx_queue_release,
+	.rx_queue_setup         = cxgbe_dev_rx_queue_setup,
+	.rx_queue_start		= cxgbe_dev_rx_queue_start,
+	.rx_queue_stop		= cxgbe_dev_rx_queue_stop,
+	.rx_queue_release	= cxgbe_dev_rx_queue_release,
+	.stats_get		= cxgbe_dev_stats_get,
+	.stats_reset		= cxgbe_dev_stats_reset,
+	.flow_ctrl_get		= cxgbe_flow_ctrl_get,
+	.flow_ctrl_set		= cxgbe_flow_ctrl_set,
+};
+
+/*
+ * Initialize driver
+ * It returns 0 on success.
+ */
+static int eth_cxgbe_dev_init(struct rte_eth_dev *eth_dev)
+{
+	struct rte_pci_device *pci_dev;
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adapter = NULL;
+	char name[RTE_ETH_NAME_MAX_LEN];
+	int err = 0;
+
+	CXGBE_FUNC_TRACE();
+
+	eth_dev->dev_ops = &cxgbe_eth_dev_ops;
+	eth_dev->rx_pkt_burst = &cxgbe_recv_pkts;
+	eth_dev->tx_pkt_burst = &cxgbe_xmit_pkts;
+
+	/* for secondary processes, we don't initialise any further as primary
+	 * has already done this work.
+	 */
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+		return 0;
+
+	pci_dev = eth_dev->pci_dev;
+	snprintf(name, sizeof(name), "cxgbeadapter%d", eth_dev->data->port_id);
+	adapter = rte_zmalloc(name, sizeof(*adapter), 0);
+	if (!adapter)
+		return -1;
+
+	adapter->use_unpacked_mode = 1;
+	adapter->regs = (void *)pci_dev->mem_resource[0].addr;
+	if (!adapter->regs) {
+		dev_err(adapter, "%s: cannot map device registers\n", __func__);
+		err = -ENOMEM;
+		goto out_free_adapter;
+	}
+	adapter->pdev = pci_dev;
+	adapter->eth_dev = eth_dev;
+	pi->adapter = adapter;
+
+	err = cxgbe_probe(adapter);
+	if (err)
+		dev_err(adapter, "%s: cxgbe probe failed with err %d\n",
+			__func__, err);
+
+out_free_adapter:
+	return err;
+}
+
+static struct eth_driver rte_cxgbe_pmd = {
+	{
+		.name = "rte_cxgbe_pmd",
+		.id_table = cxgb4_pci_tbl,
+		.drv_flags = RTE_PCI_DRV_NEED_MAPPING | RTE_PCI_DRV_INTR_LSC,
+	},
+	.eth_dev_init = eth_cxgbe_dev_init,
+	.dev_private_size = sizeof(struct port_info),
+};
+
+/*
+ * Driver initialization routine.
+ * Invoked once at EAL init time.
+ * Register itself as the [Poll Mode] Driver of PCI CXGBE devices.
+ */
+static int rte_cxgbe_pmd_init(const char *name __rte_unused,
+			      const char *params __rte_unused)
+{
+	CXGBE_FUNC_TRACE();
+
+	rte_eth_driver_register(&rte_cxgbe_pmd);
+	return 0;
+}
+
+static struct rte_driver rte_cxgbe_driver = {
+	.name = "cxgbe_driver",
+	.type = PMD_PDEV,
+	.init = rte_cxgbe_pmd_init,
+};
+
+PMD_REGISTER_DRIVER(rte_cxgbe_driver);
diff --git a/lib/librte_pmd_cxgbe/cxgbe_main.c b/lib/librte_pmd_cxgbe/cxgbe_main.c
new file mode 100644
index 0000000..b39a798
--- /dev/null
+++ b/lib/librte_pmd_cxgbe/cxgbe_main.c
@@ -0,0 +1,1219 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2014-2015 Chelsio Communications.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Chelsio Communications nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <inttypes.h>
+#include <netinet/in.h>
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_interrupts.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_pci.h>
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_alarm.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_atomic.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_dev.h>
+
+#include "common.h"
+#include "t4_regs.h"
+#include "t4_msg.h"
+#include "cxgbe.h"
+
+/*
+ * Response queue handler for the FW event queue.
+ */
+static int fwevtq_handler(struct sge_rspq *q, const __be64 *rsp,
+			  __rte_unused const struct pkt_gl *gl)
+{
+	u8 opcode = ((const struct rss_header *)rsp)->opcode;
+
+	rsp++;                                          /* skip RSS header */
+
+	/*
+	 * FW can send EGR_UPDATEs encapsulated in a CPL_FW4_MSG.
+	 */
+	if (unlikely(opcode == CPL_FW4_MSG &&
+		     ((const struct cpl_fw4_msg *)rsp)->type ==
+		      FW_TYPE_RSSCPL)) {
+		rsp++;
+		opcode = ((const struct rss_header *)rsp)->opcode;
+		rsp++;
+		if (opcode != CPL_SGE_EGR_UPDATE) {
+			dev_err(q->adapter, "unexpected FW4/CPL %#x on FW event queue\n",
+				opcode);
+			goto out;
+		}
+	}
+
+	if (likely(opcode == CPL_SGE_EGR_UPDATE)) {
+		/* do nothing */
+	} else if (opcode == CPL_FW6_MSG || opcode == CPL_FW4_MSG) {
+		const struct cpl_fw6_msg *msg = (const void *)rsp;
+
+		t4_handle_fw_rpl(q->adapter, msg->data);
+	} else {
+		dev_err(q->adapter, "unexpected CPL %#x on FW event queue\n",
+			opcode);
+	}
+out:
+	return 0;
+}
+
+int setup_sge_fwevtq(struct adapter *adapter)
+{
+	struct sge *s = &adapter->sge;
+	int err = 0;
+	int msi_idx = 0;
+
+	err = t4_sge_alloc_rxq(adapter, &s->fw_evtq, true, adapter->eth_dev,
+			       msi_idx, NULL, fwevtq_handler, -1, NULL, 0,
+			       rte_socket_id());
+	return err;
+}
+
+static int closest_timer(const struct sge *s, int time)
+{
+	unsigned int i, match = 0;
+	int delta, min_delta = INT_MAX;
+
+	for (i = 0; i < ARRAY_SIZE(s->timer_val); i++) {
+		delta = time - s->timer_val[i];
+		if (delta < 0)
+			delta = -delta;
+		if (delta < min_delta) {
+			min_delta = delta;
+			match = i;
+		}
+	}
+	return match;
+}
+
+static int closest_thres(const struct sge *s, int thres)
+{
+	unsigned int i, match = 0;
+	int delta, min_delta = INT_MAX;
+
+	for (i = 0; i < ARRAY_SIZE(s->counter_val); i++) {
+		delta = thres - s->counter_val[i];
+		if (delta < 0)
+			delta = -delta;
+		if (delta < min_delta) {
+			min_delta = delta;
+			match = i;
+		}
+	}
+	return match;
+}
+
+/**
+ * cxgb4_set_rspq_intr_params - set a queue's interrupt holdoff parameters
+ * @q: the Rx queue
+ * @us: the hold-off time in us, or 0 to disable timer
+ * @cnt: the hold-off packet count, or 0 to disable counter
+ *
+ * Sets an Rx queue's interrupt hold-off time and packet count.  At least
+ * one of the two needs to be enabled for the queue to generate interrupts.
+ */
+int cxgb4_set_rspq_intr_params(struct sge_rspq *q, unsigned int us,
+			       unsigned int cnt)
+{
+	struct adapter *adap = q->adapter;
+	unsigned int timer_val;
+
+	if (cnt) {
+		int err;
+		u32 v, new_idx;
+
+		new_idx = closest_thres(&adap->sge, cnt);
+		if (q->desc && q->pktcnt_idx != new_idx) {
+			/* the queue has already been created, update it */
+			v = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
+			    V_FW_PARAMS_PARAM_X(
+			    FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH) |
+			    V_FW_PARAMS_PARAM_YZ(q->cntxt_id);
+			err = t4_set_params(adap, adap->mbox, adap->pf, 0, 1,
+					    &v, &new_idx);
+			if (err)
+				return err;
+		}
+		q->pktcnt_idx = new_idx;
+	}
+
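+	/*
+	 * Map the requested hold-off time to the closest entry in the SGE
+	 * timer array.  As a rough illustration (the actual values are read
+	 * from the SGE timer registers at init), with timer values of
+	 * {5, 10, 20, 50, 100, 200} us a request of us == 60 would pick
+	 * index 3 (50 us); us == 0 instead selects the restart-counter
+	 * pseudo-timer so only the packet-count threshold applies.
+	 */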
+	timer_val = (us == 0) ? X_TIMERREG_RESTART_COUNTER :
+				closest_timer(&adap->sge, us);
+
+	if ((us | cnt) == 0)
+		q->intr_params = V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX);
+	else
+		q->intr_params = V_QINTR_TIMER_IDX(timer_val) |
+				 V_QINTR_CNT_EN(cnt > 0);
+	return 0;
+}
+
+static inline bool is_x_1g_port(const struct link_config *lc)
+{
+	return ((lc->supported & FW_PORT_CAP_SPEED_1G) != 0);
+}
+
+static inline bool is_x_10g_port(const struct link_config *lc)
+{
+	return ((lc->supported & FW_PORT_CAP_SPEED_10G) != 0 ||
+		(lc->supported & FW_PORT_CAP_SPEED_40G) != 0 ||
+		(lc->supported & FW_PORT_CAP_SPEED_100G) != 0);
+}
+
+inline void init_rspq(struct adapter *adap, struct sge_rspq *q,
+		      unsigned int us, unsigned int cnt,
+		      unsigned int size, unsigned int iqe_size)
+{
+	q->adapter = adap;
+	cxgb4_set_rspq_intr_params(q, us, cnt);
+	q->iqe_len = iqe_size;
+	q->size = size;
+}
+
+int cfg_queue_count(struct rte_eth_dev *eth_dev)
+{
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adap = pi->adapter;
+	struct sge *s = &adap->sge;
+	unsigned int max_queues = s->max_ethqsets / adap->params.nports;
+
+	if ((eth_dev->data->nb_rx_queues < 1) ||
+	    (eth_dev->data->nb_tx_queues < 1))
+		return -EINVAL;
+
+	if ((eth_dev->data->nb_rx_queues > max_queues) ||
+	    (eth_dev->data->nb_tx_queues > max_queues))
+		return -EINVAL;
+
+	if (eth_dev->data->nb_rx_queues > pi->rss_size)
+		return -EINVAL;
+
+	/* We must configure RSS, since config has changed */
+	pi->flags &= ~PORT_RSS_DONE;
+
+	pi->n_rx_qsets = eth_dev->data->nb_rx_queues;
+	pi->n_tx_qsets = eth_dev->data->nb_tx_queues;
+
+	return 0;
+}
+
+void cfg_queues(struct rte_eth_dev *eth_dev)
+{
+	struct rte_config *config = rte_eal_get_configuration();
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	struct adapter *adap = pi->adapter;
+	struct sge *s = &adap->sge;
+	unsigned int i, nb_ports = 0, qidx = 0;
+	unsigned int q_per_port = 0;
+
+	if (!(adap->flags & CFG_QUEUES)) {
+		for_each_port(adap, i) {
+			struct port_info *tpi = adap2pinfo(adap, i);
+
+			nb_ports += (is_x_10g_port(&tpi->link_cfg)) ||
+				     is_x_1g_port(&tpi->link_cfg) ? 1 : 0;
+		}
+
+		/*
+		 * By default, give each 1G/10G port up to one queue set per lcore.
+		 */
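+		/*
+		 * Illustrative example only: with MAX_ETH_QSETS == 64, a
+		 * 4-port adapter where all four ports are 1G/10G capable,
+		 * and 8 lcores, the division below gives 64 / 4 = 16 queue
+		 * sets per port, which is then capped at 8 by the lcore
+		 * count check that follows.
+		 */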
+		if (nb_ports)
+			q_per_port = (MAX_ETH_QSETS -
+				     (adap->params.nports - nb_ports)) /
+				     nb_ports;
+
+		if (q_per_port > config->lcore_count)
+			q_per_port = config->lcore_count;
+
+		for_each_port(adap, i) {
+			struct port_info *pi = adap2pinfo(adap, i);
+
+			pi->first_qset = qidx;
+
+			/* Initially n_rx_qsets == n_tx_qsets */
+			pi->n_rx_qsets = (is_x_10g_port(&pi->link_cfg) ||
+					  is_x_1g_port(&pi->link_cfg)) ?
+					  q_per_port : 1;
+			pi->n_tx_qsets = pi->n_rx_qsets;
+
+			if (pi->n_rx_qsets > pi->rss_size)
+				pi->n_rx_qsets = pi->rss_size;
+
+			qidx += pi->n_rx_qsets;
+		}
+
+		s->max_ethqsets = qidx;
+
+		for (i = 0; i < ARRAY_SIZE(s->ethrxq); i++) {
+			struct sge_eth_rxq *r = &s->ethrxq[i];
+
+			init_rspq(adap, &r->rspq, 0, 0, 1024, 64);
+			r->usembufs = 1;
+			r->fl.size = (r->usembufs ? 1024 : 72);
+		}
+
+		for (i = 0; i < ARRAY_SIZE(s->ethtxq); i++)
+			s->ethtxq[i].q.size = 1024;
+
+		init_rspq(adap, &adap->sge.fw_evtq, 0, 0, 1024, 64);
+		adap->flags |= CFG_QUEUES;
+	}
+}
+
+void cxgbe_stats_get(struct port_info *pi, struct port_stats *stats)
+{
+	t4_get_port_stats_offset(pi->adapter, pi->tx_chan, stats,
+				 &pi->stats_base);
+}
+
+void cxgbe_stats_reset(struct port_info *pi)
+{
+	t4_clr_port_stats(pi->adapter, pi->tx_chan);
+}
+
+static void setup_memwin(struct adapter *adap)
+{
+	u32 mem_win0_base;
+
+	/* For T5, only relative offset inside the PCIe BAR is passed */
+	mem_win0_base = MEMWIN0_BASE;
+
+	/*
+	 * Set up memory window for accessing adapter memory ranges.  (Read
+	 * back MA register to ensure that changes propagate before we attempt
+	 * to use the new values.)
+	 */
+	t4_write_reg(adap,
+		     PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN,
+					 MEMWIN_NIC),
+		     mem_win0_base | V_BIR(0) |
+		     V_WINDOW(ilog2(MEMWIN0_APERTURE) - X_WINDOW_SHIFT));
+	t4_read_reg(adap,
+		    PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN,
+					MEMWIN_NIC));
+}
+
+static int init_rss(struct adapter *adap)
+{
+	unsigned int i;
+	int err;
+
+	err = t4_init_rss_mode(adap, adap->mbox);
+	if (err)
+		return err;
+
+	for_each_port(adap, i) {
+		struct port_info *pi = adap2pinfo(adap, i);
+
+		pi->rss = rte_zmalloc(NULL, pi->rss_size, 0);
+		if (!pi->rss)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+static void print_port_info(struct adapter *adap)
+{
+	int i;
+	char buf[80];
+	struct rte_pci_addr *loc = &adap->pdev->addr;
+
+	for_each_port(adap, i) {
+		const struct port_info *pi = &adap->port[i];
+		char *bufp = buf;
+
+		if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_100M)
+			bufp += sprintf(bufp, "100/");
+		if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_1G)
+			bufp += sprintf(bufp, "1000/");
+		if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G)
+			bufp += sprintf(bufp, "10G/");
+		if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G)
+			bufp += sprintf(bufp, "40G/");
+		if (bufp != buf)
+			--bufp;
+		sprintf(bufp, "BASE-%s",
+			t4_get_port_type_description(pi->port_type));
+
+		dev_info(adap,
+			 " " PCI_PRI_FMT " Chelsio rev %d %s %s\n",
+			 loc->domain, loc->bus, loc->devid, loc->function,
+			 CHELSIO_CHIP_RELEASE(adap->params.chip), buf,
+			 (adap->flags & USING_MSIX) ? " MSI-X" :
+			 (adap->flags & USING_MSI) ? " MSI" : "");
+	}
+}
+
+/*
+ * Tweak configuration based on module parameters, etc.  Most of these have
+ * defaults assigned to them by Firmware Configuration Files (if we're using
+ * them) but need to be explicitly set if we're using hard-coded
+ * initialization.  But even in the case of using Firmware Configuration
+ * Files, we'd like to expose the ability to change these via module
+ * parameters so these are essentially common tweaks/settings for
+ * Configuration Files and hard-coded initialization ...
+ */
+static int adap_init0_tweaks(struct adapter *adapter)
+{
+	u8 rx_dma_offset;
+
+	/*
+	 * Fix up various Host-Dependent Parameters like Page Size, Cache
+	 * Line Size, etc.  The firmware default is for a 4KB Page Size and
+	 * 64B Cache Line Size ...
+	 */
+	t4_fixup_host_params_compat(adapter, PAGE_SIZE, L1_CACHE_BYTES,
+				    T5_LAST_REV);
+
+	/*
+	 * Normally we tell the chip to deliver Ingress Packets into our DMA
+	 * buffers offset by 2 bytes in order to have the IP headers line up on
+	 * 4-byte boundaries.  This is a requirement for many architectures
+	 * which will throw a machine check fault if an attempt is made to
+	 * access one of the 4-byte IP header fields on a non-4-byte boundary.
+	 * And it's a major performance issue even on some architectures which
+	 * allow it, like some implementations of the x86 ISA.  However, some
+	 * architectures don't mind this and, for some very edge-case performance
+	 * sensitive applications (like forwarding large volumes of small
+	 * packets), setting this DMA offset to 0 will decrease the number of
+	 * PCI-E Bus transfers enough to measurably affect performance.
+	 */
+	rx_dma_offset = 2;
+	t4_set_reg_field(adapter, A_SGE_CONTROL, V_PKTSHIFT(M_PKTSHIFT),
+			 V_PKTSHIFT(rx_dma_offset));
+
+	/*
+	 * Don't include the "IP Pseudo Header" in CPL_RX_PKT checksums: the
+	 * host stack adds the pseudo header itself.
+	 */
+	t4_tp_wr_bits_indirect(adapter, A_TP_INGRESS_CONFIG,
+			       F_CSUM_HAS_PSEUDO_HDR, 0);
+
+	return 0;
+}
+
+/*
+ * Attempt to initialize the adapter via a Firmware Configuration File.
+ */
+static int adap_init0_config(struct adapter *adapter, int reset)
+{
+	struct fw_caps_config_cmd caps_cmd;
+	unsigned long mtype = 0, maddr = 0;
+	u32 finiver, finicsum, cfcsum;
+	int ret;
+	int config_issued = 0;
+	int cfg_addr;
+	char config_name[20];
+
+	/*
+	 * Reset device if necessary.
+	 */
+	if (reset) {
+		ret = t4_fw_reset(adapter, adapter->mbox,
+				  F_PIORSTMODE | F_PIORST);
+		if (ret < 0) {
+			dev_warn(adapter, "Firmware reset failed, error %d\n",
+				 -ret);
+			goto bye;
+		}
+	}
+
+	cfg_addr = t4_flash_cfg_addr(adapter);
+	if (cfg_addr < 0) {
+		ret = cfg_addr;
+		dev_warn(adapter, "Finding address for firmware config file in flash failed, error %d\n",
+			 -ret);
+		goto bye;
+	}
+
+	strcpy(config_name, "On Flash");
+	mtype = FW_MEMTYPE_CF_FLASH;
+	maddr = cfg_addr;
+
+	/*
+	 * Issue a Capability Configuration command to the firmware to get it
+	 * to parse the Configuration File.  We don't use t4_fw_config_file()
+	 * because we want the ability to modify various features after we've
+	 * processed the configuration file ...
+	 */
+	memset(&caps_cmd, 0, sizeof(caps_cmd));
+	caps_cmd.op_to_write = cpu_to_be32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+					   F_FW_CMD_REQUEST | F_FW_CMD_READ);
+	caps_cmd.cfvalid_to_len16 =
+		cpu_to_be32(F_FW_CAPS_CONFIG_CMD_CFVALID |
+			    V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
+			    V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(maddr >> 16) |
+			    FW_LEN16(caps_cmd));
+	ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd),
+			 &caps_cmd);
+	/*
+	 * If the CAPS_CONFIG failed with an ENOENT (for a Firmware
+	 * Configuration File in FLASH), our last gasp effort is to use the
+	 * Firmware Configuration File which is embedded in the firmware.  A
+	 * very few early versions of the firmware didn't have one embedded
+	 * but we can ignore those.
+	 */
+	if (ret == -ENOENT) {
+		dev_info(adapter, "%s: Going for embedded config in firmware..\n",
+			 __func__);
+
+		memset(&caps_cmd, 0, sizeof(caps_cmd));
+		caps_cmd.op_to_write =
+			cpu_to_be32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+				    F_FW_CMD_REQUEST | F_FW_CMD_READ);
+		caps_cmd.cfvalid_to_len16 = cpu_to_be32(FW_LEN16(caps_cmd));
+		ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd,
+				 sizeof(caps_cmd), &caps_cmd);
+		strcpy(config_name, "Firmware Default");
+	}
+
+	config_issued = 1;
+	if (ret < 0)
+		goto bye;
+
+	finiver = be32_to_cpu(caps_cmd.finiver);
+	finicsum = be32_to_cpu(caps_cmd.finicsum);
+	cfcsum = be32_to_cpu(caps_cmd.cfcsum);
+	if (finicsum != cfcsum)
+		dev_warn(adapter, "Configuration File checksum mismatch: [fini] csum=%#x, computed csum=%#x\n",
+			 finicsum, cfcsum);
+
+	/*
+	 * If we're a pure NIC driver then disable all offloading facilities.
+	 * This will allow the firmware to optimize aspects of the hardware
+	 * configuration which will result in improved performance.
+	 */
+	caps_cmd.niccaps &= cpu_to_be16(~(FW_CAPS_CONFIG_NIC_HASHFILTER |
+					  FW_CAPS_CONFIG_NIC_ETHOFLD));
+	caps_cmd.toecaps = 0;
+	caps_cmd.iscsicaps = 0;
+	caps_cmd.rdmacaps = 0;
+	caps_cmd.fcoecaps = 0;
+
+	/*
+	 * And now tell the firmware to use the configuration we just loaded.
+	 */
+	caps_cmd.op_to_write = cpu_to_be32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+					   F_FW_CMD_REQUEST | F_FW_CMD_WRITE);
+	caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd));
+	ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd),
+			 NULL);
+	if (ret < 0) {
+		dev_warn(adapter, "Unable to finalize Firmware Capabilities %d\n",
+			 -ret);
+		goto bye;
+	}
+
+	/*
+	 * Tweak configuration based on system architecture, module
+	 * parameters, etc.
+	 */
+	ret = adap_init0_tweaks(adapter);
+	if (ret < 0) {
+		dev_warn(adapter, "Unable to do init0-tweaks %d\n", -ret);
+		goto bye;
+	}
+
+	/*
+	 * And finally tell the firmware to initialize itself using the
+	 * parameters from the Configuration File.
+	 */
+	ret = t4_fw_initialize(adapter, adapter->mbox);
+	if (ret < 0) {
+		dev_warn(adapter, "Initializing Firmware failed, error %d\n",
+			 -ret);
+		goto bye;
+	}
+
+	/*
+	 * Return successfully and note that we're operating with parameters
+	 * taken from the Configuration File rather than from hard-wired
+	 * initialization constants buried in the driver.
+	 */
+	dev_info(adapter,
+		 "Successfully configured using Firmware Configuration File \"%s\", version %#x, computed checksum %#x\n",
+		 config_name, finiver, cfcsum);
+
+	return 0;
+
+	/*
+	 * Something bad happened.  Return the error ...  (If the "error"
+	 * is that there's no Configuration File on the adapter we don't
+	 * want to issue a warning since this is fairly common.)
+	 */
+bye:
+	if (config_issued && ret != -ENOENT)
+		dev_warn(adapter, "\"%s\" configuration file error %d\n",
+			 config_name, -ret);
+
+	dev_debug(adapter, "%s: returning ret = %d ..\n", __func__, ret);
+	return ret;
+}
+
+static int adap_init0(struct adapter *adap)
+{
+	int ret = 0;
+	u32 v, port_vec;
+	enum dev_state state;
+	u32 params[7], val[7];
+	int reset = 1;
+	int mbox = adap->mbox;
+
+	/*
+	 * Contact FW, advertising Master capability (and potentially forcing
+	 * ourselves as the Master PF if our module parameter force_init is
+	 * set).
+	 */
+	ret = t4_fw_hello(adap, adap->mbox, adap->mbox, MASTER_MAY, &state);
+	if (ret < 0) {
+		dev_err(adap, "%s: could not connect to FW, error %d\n",
+			__func__, -ret);
+		goto bye;
+	}
+
+	CXGBE_DEBUG_MBOX(adap, "%s: adap->mbox = %d; ret = %d\n", __func__,
+			 adap->mbox, ret);
+
+	if (ret == mbox)
+		adap->flags |= MASTER_PF;
+
+	if (state == DEV_STATE_INIT) {
+		/*
+		 * Force halt and reset FW because a previous instance may have
+		 * exited abnormally without properly shutting down
+		 */
+		ret = t4_fw_halt(adap, adap->mbox, reset);
+		if (ret < 0) {
+			dev_err(adap, "Failed to halt. Exit.\n");
+			goto bye;
+		}
+
+		ret = t4_fw_restart(adap, adap->mbox, reset);
+		if (ret < 0) {
+			dev_err(adap, "Failed to restart. Exit.\n");
+			goto bye;
+		}
+		state &= ~DEV_STATE_INIT;
+	}
+
+	t4_get_fw_version(adap, &adap->params.fw_vers);
+	t4_get_tp_version(adap, &adap->params.tp_vers);
+
+	dev_info(adap, "fw: %u.%u.%u.%u, TP: %u.%u.%u.%u\n",
+		 G_FW_HDR_FW_VER_MAJOR(adap->params.fw_vers),
+		 G_FW_HDR_FW_VER_MINOR(adap->params.fw_vers),
+		 G_FW_HDR_FW_VER_MICRO(adap->params.fw_vers),
+		 G_FW_HDR_FW_VER_BUILD(adap->params.fw_vers),
+		 G_FW_HDR_FW_VER_MAJOR(adap->params.tp_vers),
+		 G_FW_HDR_FW_VER_MINOR(adap->params.tp_vers),
+		 G_FW_HDR_FW_VER_MICRO(adap->params.tp_vers),
+		 G_FW_HDR_FW_VER_BUILD(adap->params.tp_vers));
+
+	ret = t4_get_core_clock(adap, &adap->params.vpd);
+	if (ret < 0) {
+		dev_err(adap, "%s: could not get core clock, error %d\n",
+			__func__, -ret);
+		goto bye;
+	}
+
+	/*
+	 * Find out what ports are available to us.  Note that we need to do
+	 * this early since the rest of adapter initialization needs nports
+	 * and portvec ...
+	 */
+	v = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
+	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_PORTVEC);
+	ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 1, &v, &port_vec);
+	if (ret < 0) {
+		dev_err(adap, "%s: failure in t4_query_params; error = %d\n",
+			__func__, ret);
+		goto bye;
+	}
+
+	adap->params.nports = hweight32(port_vec);
+	adap->params.portvec = port_vec;
+
+	dev_debug(adap, "%s: adap->params.nports = %u\n", __func__,
+		  adap->params.nports);
+
+	/*
+	 * If the firmware is initialized already (and we're not forcing a
+	 * master initialization), note that we're living with existing
+	 * adapter parameters.  Otherwise, it's time to try initializing the
+	 * adapter ...
+	 */
+	if (state == DEV_STATE_INIT) {
+		dev_info(adap, "Coming up as %s: Adapter already initialized\n",
+			 adap->flags & MASTER_PF ? "MASTER" : "SLAVE");
+	} else {
+		dev_info(adap, "Coming up as MASTER: Initializing adapter\n");
+
+		ret = adap_init0_config(adap, reset);
+		if (ret == -ENOENT) {
+			dev_err(adap,
+				"No Configuration File present on adapter. Using hard-wired configuration parameters.\n");
+			goto bye;
+		}
+	}
+	if (ret < 0) {
+		dev_err(adap, "could not initialize adapter, error %d\n", -ret);
+		goto bye;
+	}
+
+	/*
+	 * Give the SGE code a chance to pull in anything that it needs ...
+	 * Note that this must be called after we retrieve our VPD parameters
+	 * in order to know how to convert core ticks to seconds, etc.
+	 */
+	ret = t4_sge_init(adap);
+	if (ret < 0) {
+		dev_err(adap, "t4_sge_init failed with error %d\n",
+			-ret);
+		goto bye;
+	}
+
+	/*
+	 * Grab some of our basic fundamental operating parameters.
+	 */
+#define FW_PARAM_DEV(param) \
+	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \
+	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param))
+
+#define FW_PARAM_PFVF(param) \
+	(V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \
+	 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param) |  \
+	 V_FW_PARAMS_PARAM_Y(0) | \
+	 V_FW_PARAMS_PARAM_Z(0))
+
+	/* If we're running on newer firmware, let it know that we're
+	 * prepared to deal with encapsulated CPL messages.  Older
+	 * firmware won't understand this and we'll just get
+	 * unencapsulated messages ...
+	 */
+	params[0] = FW_PARAM_PFVF(CPLFW4MSG_ENCAP);
+	val[0] = 1;
+	(void)t4_set_params(adap, adap->mbox, adap->pf, 0, 1, params, val);
+
+	/*
+	 * Find out whether we're allowed to use the T5+ ULPTX MEMWRITE DSGL
+	 * capability.  Earlier versions of the firmware didn't have the
+	 * ULPTX_MEMWRITE_DSGL so we'll interpret a query failure as no
+	 * permission to use ULPTX MEMWRITE DSGL.
+	 */
+	if (is_t4(adap->params.chip)) {
+		adap->params.ulptx_memwrite_dsgl = false;
+	} else {
+		params[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL);
+		ret = t4_query_params(adap, adap->mbox, adap->pf, 0,
+				      1, params, val);
+		adap->params.ulptx_memwrite_dsgl = (ret == 0 && val[0] != 0);
+	}
+
+	/*
+	 * The MTU/MSS Table is initialized by now, so load their values.  If
+	 * we're initializing the adapter, then we'll make any modifications
+	 * we want to the MTU/MSS Table and also initialize the congestion
+	 * parameters.
+	 */
+	t4_read_mtu_tbl(adap, adap->params.mtus, NULL);
+	if (state != DEV_STATE_INIT) {
+		int i;
+
+		/*
+		 * The default MTU Table contains values 1492 and 1500.
+		 * However, for TCP, it's better to have two values which are
+		 * a multiple of 8 +/- 4 bytes apart near this popular MTU.
+		 * This allows us to have a TCP Data Payload which is a
+		 * multiple of 8 regardless of what combination of TCP Options
+		 * are in use (always a multiple of 4 bytes) which is
+		 * important for performance reasons.  For instance, if no
+		 * options are in use, then we have a 20-byte IP header and a
+		 * 20-byte TCP header.  In this case, a 1500-byte MSS would
+		 * result in a TCP Data Payload of 1500 - 40 == 1460 bytes
+		 * which is not a multiple of 8.  So using an MSS of 1488 in
+		 * this case results in a TCP Data Payload of 1448 bytes which
+		 * is a multiple of 8.  On the other hand, if 12-byte TCP Time
+		 * Stamps have been negotiated, then an MTU of 1500 bytes
+		 * results in a TCP Data Payload of 1448 bytes which, as
+		 * above, is a multiple of 8 bytes ...
+		 */
+		for (i = 0; i < NMTUS; i++)
+			if (adap->params.mtus[i] == 1492) {
+				adap->params.mtus[i] = 1488;
+				break;
+			}
+
+		t4_load_mtus(adap, adap->params.mtus, adap->params.a_wnd,
+			     adap->params.b_wnd);
+	}
+	t4_init_sge_params(adap);
+	t4_init_tp_params(adap);
+
+	adap->params.drv_memwin = MEMWIN_NIC;
+	adap->flags |= FW_OK;
+	dev_debug(adap, "%s: returning zero..\n", __func__);
+	return 0;
+
+	/*
+	 * Something bad happened.  If a command timed out or failed with EIO,
+	 * the FW is not operating within its spec or something catastrophic
+	 * happened to the HW/FW, so stop issuing commands.
+	 */
+bye:
+	if (ret != -ETIMEDOUT && ret != -EIO)
+		t4_fw_bye(adap, adap->mbox);
+	return ret;
+}
+
+/**
+ * t4_os_portmod_changed - handle port module changes
+ * @adap: the adapter associated with the module change
+ * @port_id: the port index whose module status has changed
+ *
+ * This is the OS-dependent handler for port module changes.  It is
+ * invoked when a port module is removed or inserted for any OS-specific
+ * processing.
+ */
+void t4_os_portmod_changed(const struct adapter *adap, int port_id)
+{
+	static const char * const mod_str[] = {
+		NULL, "LR", "SR", "ER", "passive DA", "active DA", "LRM"
+	};
+
+	const struct port_info *pi = &adap->port[port_id];
+
+	if (pi->mod_type == FW_PORT_MOD_TYPE_NONE)
+		dev_info(adap, "Port%d: port module unplugged\n", pi->port_id);
+	else if (pi->mod_type < ARRAY_SIZE(mod_str))
+		dev_info(adap, "Port%d: %s port module inserted\n", pi->port_id,
+			 mod_str[pi->mod_type]);
+	else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED)
+		dev_info(adap, "Port%d: unsupported optical port module inserted\n",
+			 pi->port_id);
+	else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN)
+		dev_info(adap, "Port%d: unknown port module inserted, forcing TWINAX\n",
+			 pi->port_id);
+	else if (pi->mod_type == FW_PORT_MOD_TYPE_ERROR)
+		dev_info(adap, "Port%d: transceiver module error\n",
+			 pi->port_id);
+	else
+		dev_info(adap, "Port%d: unknown module type %d inserted\n",
+			 pi->port_id, pi->mod_type);
+}
+
+/**
+ * link_start - enable a port
+ * @pi: the port to enable
+ *
+ * Performs the MAC and PHY actions needed to enable a port.
+ */
+int link_start(struct port_info *pi)
+{
+	struct adapter *adapter = pi->adapter;
+	int ret;
+
+	/*
+	 * We do not set address filters and promiscuity here, the stack does
+	 * that step explicitly.
+	 */
+	ret = t4_set_rxmode(adapter, adapter->mbox, pi->viid, 1500, -1, -1,
+			    -1, 1, true);
+	if (ret == 0) {
+		ret = t4_change_mac(adapter, adapter->mbox, pi->viid,
+				    pi->xact_addr_filt,
+				    (u8 *)&pi->eth_dev->data->mac_addrs[0],
+				    true, true);
+		if (ret >= 0) {
+			pi->xact_addr_filt = ret;
+			ret = 0;
+		}
+	}
+	if (ret == 0)
+		ret = t4_link_l1cfg(adapter, adapter->mbox, pi->tx_chan,
+				    &pi->link_cfg);
+	if (ret == 0) {
+		/*
+		 * Enabling a Virtual Interface can result in an interrupt
+		 * during the processing of the VI Enable command and, in some
+		 * paths, result in an attempt to issue another command in the
+		 * interrupt context.  Thus, we disable interrupts during the
+		 * course of the VI Enable command ...
+		 */
+		ret = t4_enable_vi_params(adapter, adapter->mbox, pi->viid,
+					  true, true, false);
+	}
+	return ret;
+}
+
+/**
+ * cxgb4_write_rss - write the RSS table for a given port
+ * @pi: the port
+ * @queues: array of queue indices for RSS
+ *
+ * Sets up the portion of the HW RSS table for the port's VI to distribute
+ * packets to the Rx queues in @queues.
+ */
+int cxgb4_write_rss(const struct port_info *pi, const u16 *queues)
+{
+	u16 *rss;
+	int i, err;
+	struct adapter *adapter = pi->adapter;
+	const struct sge_eth_rxq *rxq;
+
+	/*  Should never be called before setting up sge eth rx queues */
+	BUG_ON(!(adapter->flags & FULL_INIT_DONE));
+
+	rxq = &adapter->sge.ethrxq[pi->first_qset];
+	rss = rte_zmalloc(NULL, pi->rss_size * sizeof(u16), 0);
+	if (!rss)
+		return -ENOMEM;
+
+	/* map the queue indices to queue ids */
+	for (i = 0; i < pi->rss_size; i++, queues++)
+		rss[i] = rxq[*queues].rspq.abs_id;
+
+	err = t4_config_rss_range(adapter, adapter->pf, pi->viid, 0,
+				  pi->rss_size, rss, pi->rss_size);
+	/*
+	 * If Tunnel All Lookup isn't specified in the global RSS
+	 * Configuration, then we need to specify a default Ingress
+	 * Queue for any ingress packets which aren't hashed.  We'll
+	 * use our first ingress queue ...
+	 */
+	if (!err)
+		err = t4_config_vi_rss(adapter, adapter->mbox, pi->viid,
+				       F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN |
+				       F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN |
+				       F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN |
+				       F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN |
+				       F_FW_RSS_VI_CONFIG_CMD_UDPEN,
+				       rss[0]);
+	rte_free(rss);
+	return err;
+}
+
+/**
+ * setup_rss - configure RSS
+ * @adapter: the adapter
+ *
+ * Sets up RSS to distribute packets to multiple receive queues.  We
+ * configure the RSS CPU lookup table to distribute to the number of HW
+ * receive queues, and the response queue lookup table to narrow that
+ * down to the response queues actually configured for each port.
+ * We always configure the RSS mapping for all ports since the mapping
+ * table has plenty of entries.
+ */
+int setup_rss(struct port_info *pi)
+{
+	int j, err;
+	struct adapter *adapter = pi->adapter;
+
+	dev_debug(adapter, "%s:  pi->rss_size = %u; pi->n_rx_qsets = %u\n",
+		  __func__, pi->rss_size, pi->n_rx_qsets);
+
+	if (!(pi->flags & PORT_RSS_DONE)) {
+		if (adapter->flags & FULL_INIT_DONE) {
+			/* Fill default values with equal distribution */
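+			/*
+			 * e.g. with rss_size == 8 and 3 Rx queue sets the
+			 * table becomes {0, 1, 2, 0, 1, 2, 0, 1}.
+			 */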
+			for (j = 0; j < pi->rss_size; j++)
+				pi->rss[j] = j % pi->n_rx_qsets;
+
+			err = cxgb4_write_rss(pi, pi->rss);
+			if (err)
+				return err;
+			pi->flags |= PORT_RSS_DONE;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Enable NAPI scheduling and interrupt generation for all Rx queues.
+ */
+static void enable_rx(struct adapter *adap)
+{
+	struct sge *s = &adap->sge;
+	struct sge_rspq *q = &s->fw_evtq;
+	int i, j;
+
+	/* 0-increment GTS to start the timer and enable interrupts */
+	t4_write_reg(adap, MYPF_REG(A_SGE_PF_GTS),
+		     V_SEINTARM(q->intr_params) |
+		     V_INGRESSQID(q->cntxt_id));
+
+	for_each_port(adap, i) {
+		const struct port_info *pi = &adap->port[i];
+		struct rte_eth_dev *eth_dev = pi->eth_dev;
+
+		for (j = 0; j < eth_dev->data->nb_rx_queues; j++) {
+			q = eth_dev->data->rx_queues[j];
+
+			/*
+			 * 0-increment GTS to start the timer and enable
+			 * interrupts
+			 */
+			t4_write_reg(adap, MYPF_REG(A_SGE_PF_GTS),
+				     V_SEINTARM(q->intr_params) |
+				     V_INGRESSQID(q->cntxt_id));
+		}
+	}
+}
+
+/**
+ * cxgbe_up - enable the adapter
+ * @adap: adapter being enabled
+ *
+ * Called when the first port is enabled, this function performs the
+ * actions necessary to make an adapter operational, such as completing
+ * the initialization of HW modules, and enabling interrupts.
+ */
+int cxgbe_up(struct adapter *adap)
+{
+	enable_rx(adap);
+	t4_intr_enable(adap);
+	adap->flags |= FULL_INIT_DONE;
+
+	/* TODO: deadman watchdog ?? */
+	return 0;
+}
+
+/*
+ * Close the port
+ */
+int cxgbe_down(struct port_info *pi)
+{
+	struct adapter *adapter = pi->adapter;
+	int err = 0;
+
+	err = t4_enable_vi(adapter, adapter->mbox, pi->viid, false, false);
+	if (err) {
+		dev_err(adapter, "%s: disable_vi failed: %d\n", __func__, err);
+		return err;
+	}
+
+	t4_reset_link_config(adapter, pi->port_id);
+	return 0;
+}
+
+/*
+ * Release resources when all the ports have been stopped.
+ */
+void cxgbe_close(struct adapter *adapter)
+{
+	struct port_info *pi;
+	int i;
+
+	if (adapter->flags & FULL_INIT_DONE) {
+		t4_intr_disable(adapter);
+		t4_free_sge_resources(adapter);
+		for_each_port(adapter, i) {
+			pi = adap2pinfo(adapter, i);
+			if (pi->viid != 0)
+				t4_free_vi(adapter, adapter->mbox,
+					   adapter->pf, 0, pi->viid);
+			rte_free(pi->eth_dev->data->mac_addrs);
+		}
+		adapter->flags &= ~FULL_INIT_DONE;
+	}
+
+	if (adapter->flags & FW_OK)
+		t4_fw_bye(adapter, adapter->mbox);
+}
+
+int cxgbe_probe(struct adapter *adapter)
+{
+	struct port_info *pi;
+	int func, i;
+	int err = 0;
+
+	func = G_SOURCEPF(t4_read_reg(adapter, A_PL_WHOAMI));
+	adapter->mbox = func;
+	adapter->pf = func;
+
+	t4_os_lock_init(&adapter->mbox_lock);
+	TAILQ_INIT(&adapter->mbox_list);
+
+	err = t4_prep_adapter(adapter);
+	if (err)
+		return err;
+
+	setup_memwin(adapter);
+	err = adap_init0(adapter);
+	if (err) {
+		dev_err(adapter, "%s: Adapter initialization failed, error %d\n",
+			__func__, err);
+		goto out_free;
+	}
+
+	if (!is_t4(adapter->params.chip)) {
+		/*
+		 * The userspace doorbell BAR is split evenly into doorbell
+		 * regions, each associated with an egress queue.  If this
+		 * per-queue region is large enough (at least UDBS_SEG_SIZE)
+		 * then it can be used to submit a tx work request with an
+		 * implied doorbell.  Enable write combining on the BAR if
+		 * there is room for such work requests.
+		 */
+		int s_qpp, qpp, num_seg;
+
+		s_qpp = (S_QUEUESPERPAGEPF0 +
+			(S_QUEUESPERPAGEPF1 - S_QUEUESPERPAGEPF0) *
+			adapter->pf);
+		qpp = 1 << ((t4_read_reg(adapter,
+				A_SGE_EGRESS_QUEUES_PER_PAGE_PF) >> s_qpp)
+				& M_QUEUESPERPAGEPF0);
+		num_seg = PAGE_SIZE / UDBS_SEG_SIZE;
+		if (qpp > num_seg)
+			dev_warn(adapter, "Incorrect SGE EGRESS QUEUES_PER_PAGE configuration, continuing in debug mode\n");
+
+		adapter->bar2 = (void *)adapter->pdev->mem_resource[2].addr;
+		if (!adapter->bar2) {
+			dev_err(adapter, "cannot map device bar2 region\n");
+			err = -ENOMEM;
+			goto out_free;
+		}
+		t4_write_reg(adapter, A_SGE_STAT_CFG, V_STATSOURCE_T5(7) |
+			     V_STATMODE(0));
+	}
+
+	for_each_port(adapter, i) {
+		char name[RTE_ETH_NAME_MAX_LEN];
+		struct rte_eth_dev_data *data = NULL;
+		const unsigned int numa_node = rte_socket_id();
+
+		pi = &adapter->port[i];
+		pi->adapter = adapter;
+		pi->xact_addr_filt = -1;
+		pi->port_id = i;
+
+		snprintf(name, sizeof(name), "cxgbe%d",
+			 adapter->eth_dev->data->port_id + i);
+
+		if (i == 0) {
+			/* First port is already allocated by DPDK */
+			pi->eth_dev = adapter->eth_dev;
+			goto allocate_mac;
+		}
+
+		/*
+		 * now do all data allocation - for eth_dev structure,
+		 * and internal (private) data for the remaining ports
+		 */
+
+		/* reserve an ethdev entry */
+		pi->eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_PCI);
+		if (!pi->eth_dev)
+			goto out_free;
+
+		data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
+		if (!data)
+			goto out_free;
+
+		data->port_id = adapter->eth_dev->data->port_id + i;
+
+		pi->eth_dev->data = data;
+
+allocate_mac:
+		pi->eth_dev->pci_dev = adapter->pdev;
+		pi->eth_dev->data->dev_private = pi;
+		pi->eth_dev->driver = adapter->eth_dev->driver;
+		pi->eth_dev->dev_ops = adapter->eth_dev->dev_ops;
+		pi->eth_dev->tx_pkt_burst = adapter->eth_dev->tx_pkt_burst;
+		pi->eth_dev->rx_pkt_burst = adapter->eth_dev->rx_pkt_burst;
+		TAILQ_INIT(&pi->eth_dev->link_intr_cbs);
+
+		pi->eth_dev->data->mac_addrs = rte_zmalloc(name,
+							   ETHER_ADDR_LEN, 0);
+		if (!pi->eth_dev->data->mac_addrs) {
+			dev_err(adapter, "%s: Mem allocation failed for storing mac addr, aborting\n",
+				__func__);
+			err = -1;
+			goto out_free;
+		}
+	}
+
+	if (adapter->flags & FW_OK) {
+		err = t4_port_init(adapter, adapter->mbox, adapter->pf, 0);
+		if (err) {
+			dev_err(adapter, "%s: t4_port_init failed with err %d\n",
+				__func__, err);
+			goto out_free;
+		}
+	}
+
+	cfg_queues(adapter->eth_dev);
+
+	print_port_info(adapter);
+
+	err = init_rss(adapter);
+	if (err)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	for_each_port(adapter, i) {
+		pi = adap2pinfo(adapter, i);
+		if (pi->viid != 0)
+			t4_free_vi(adapter, adapter->mbox, adapter->pf,
+				   0, pi->viid);
+		/* Skip first port since it'll be de-allocated by DPDK */
+		if (i == 0)
+			continue;
+		if (pi->eth_dev->data)
+			rte_free(pi->eth_dev->data);
+	}
+
+	if (adapter->flags & FW_OK)
+		t4_fw_bye(adapter, adapter->mbox);
+	return -err;
+}
diff --git a/lib/librte_pmd_cxgbe/sge.c b/lib/librte_pmd_cxgbe/sge.c
new file mode 100644
index 0000000..6343dff
--- /dev/null
+++ b/lib/librte_pmd_cxgbe/sge.c
@@ -0,0 +1,2250 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2014-2015 Chelsio Communications.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Chelsio Communications nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/if_ether.h>
+#include <sys/queue.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <inttypes.h>
+#include <netinet/in.h>
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_interrupts.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_pci.h>
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_alarm.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_atomic.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+#include <rte_dev.h>
+
+#include "common.h"
+#include "t4_regs.h"
+#include "t4_msg.h"
+#include "cxgbe.h"
+
+static inline void ship_tx_pkt_coalesce_wr(struct adapter *adap,
+					   struct sge_eth_txq *txq,
+					   uint16_t left);
+
+/*
+ * Rx buffer size for "packed" pages Free List buffers (multiple ingress
+ * packets packed per page buffer).  We use largish buffers if possible but
+ * settle for single pages under memory shortage.
+ */
+#if PAGE_SHIFT >= 16
+#define FL_PG_ORDER 0
+#else
+#define FL_PG_ORDER (16 - PAGE_SHIFT)
+#endif
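+
+/*
+ * With the 4KB PAGE_SHIFT used by this driver, FL_PG_ORDER works out to 4,
+ * i.e. "large page" Free List buffers of 64KB.
+ */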
+
+/* RX_PULL_LEN should be <= RX_COPY_THRES */
+#define RX_COPY_THRES    256
+#define RX_PULL_LEN      128
+
+/*
+ * Max number of Rx buffers we replenish at a time.
+ */
+#define MAX_RX_REFILL 16U
+
+#define NOMEM_TMR_IDX (SGE_NTIMERS - 1)
+
+/*
+ * Max Tx descriptor space we allow for an Ethernet packet to be inlined
+ * into a WR.
+ */
+#define MAX_IMM_TX_PKT_LEN 256
+
+/*
+ * Rx buffer sizes for "usembufs" Free List buffers (one ingress packet
+ * per mbuf buffer).  We currently only support two sizes for 1500- and
+ * 9000-byte MTUs. We could easily support more but there doesn't seem to be
+ * much need for that ...
+ */
+#define FL_MTU_SMALL 1500
+#define FL_MTU_LARGE 9000
+
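+/*
+ * For illustration: with the usual 2-byte packet shift and 64-byte Free List
+ * alignment, FL_MTU_SMALL_BUFSIZE comes to 2 + 14 + 4 + 1500 rounded up to
+ * 1536 bytes; the actual pktshift and fl_align values are taken from the
+ * SGE configuration at init time.
+ */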
+static inline unsigned int fl_mtu_bufsize(struct adapter *adapter,
+					  unsigned int mtu)
+{
+	struct sge *s = &adapter->sge;
+
+	return ALIGN(s->pktshift + ETH_HLEN + VLAN_HLEN + mtu, s->fl_align);
+}
+
+#define FL_MTU_SMALL_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_SMALL)
+#define FL_MTU_LARGE_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_LARGE)
+
+/*
+ * Bits 0..3 of rx_sw_desc.dma_addr have special meaning.  The hardware uses
+ * these to specify the buffer size as an index into the SGE Free List Buffer
+ * Size register array.  We also use bit 4, when the buffer has been unmapped
+ * for DMA, but this is of course never sent to the hardware and is only used
+ * to prevent double unmappings.  All of the above requires that the Free List
+ * Buffers which we allocate have the bottom 5 bits free (0) -- i.e. are
+ * aligned to 32 bytes or a larger power of 2.  Since the SGE's minimal
+ * Free List Buffer alignment is 32 bytes, this works out for us ...
+ */
+enum {
+	RX_BUF_FLAGS     = 0x1f,   /* bottom five bits are special */
+	RX_BUF_SIZE      = 0x0f,   /* bottom four bits are for buf sizes */
+	RX_UNMAPPED_BUF  = 0x10,   /* buffer is not mapped */
+
+	/*
+	 * XXX We shouldn't depend on being able to use these indices.
+	 * XXX Especially when some other Master PF has initialized the
+	 * XXX adapter or we use the Firmware Configuration File.  We
+	 * XXX should really search through the Host Buffer Size register
+	 * XXX array for the appropriately sized buffer indices.
+	 */
+	RX_SMALL_PG_BUF  = 0x0,   /* small (PAGE_SIZE) page buffer */
+	RX_LARGE_PG_BUF  = 0x1,   /* buffer large (FL_PG_ORDER) page buffer */
+
+	RX_SMALL_MTU_BUF = 0x2,   /* small MTU buffer */
+	RX_LARGE_MTU_BUF = 0x3,   /* large MTU buffer */
+};
+
+/**
+ * txq_avail - return the number of available slots in a Tx queue
+ * @q: the Tx queue
+ *
+ * Returns the number of descriptors in a Tx queue available to write new
+ * packets.
+ */
+static inline unsigned int txq_avail(const struct sge_txq *q)
+{
+	return q->size - 1 - q->in_use;
+}
+
+static int map_mbuf(struct rte_mbuf *mbuf, dma_addr_t *addr)
+{
+	struct rte_mbuf *m = mbuf;
+
+	for (; m; m = m->next, addr++) {
+		*addr = m->buf_physaddr + rte_pktmbuf_headroom(m);
+		if (*addr == 0)
+			goto out_err;
+	}
+	return 0;
+
+out_err:
+	return -ENOMEM;
+}
+
+/**
+ * free_tx_desc - reclaims Tx descriptors and their buffers
+ * @q: the Tx queue to reclaim descriptors from
+ * @n: the number of descriptors to reclaim
+ *
+ * Reclaims Tx descriptors from an SGE Tx queue and frees the associated
+ * Tx buffers.  Called with the Tx queue lock held.
+ */
+static void free_tx_desc(struct sge_txq *q, unsigned int n)
+{
+	struct tx_sw_desc *d;
+	unsigned int cidx = 0;
+
+	d = &q->sdesc[cidx];
+	while (n--) {
+		if (d->mbuf) {                       /* an SGL is present */
+			rte_pktmbuf_free(d->mbuf);
+			d->mbuf = NULL;
+		}
+		if (d->coalesce.idx) {
+			int i;
+
+			for (i = 0; i < d->coalesce.idx; i++) {
+				rte_pktmbuf_free(d->coalesce.mbuf[i]);
+				d->coalesce.mbuf[i] = NULL;
+			}
+			d->coalesce.idx = 0;
+		}
+		++d;
+		if (++cidx == q->size) {
+			cidx = 0;
+			d = q->sdesc;
+		}
+		RTE_MBUF_PREFETCH_TO_FREE(&q->sdesc->mbuf->pool);
+	}
+}
+
+static void reclaim_tx_desc(struct sge_txq *q, unsigned int n)
+{
+	unsigned int cidx = q->cidx;
+
+	while (n--) {
+		if (++cidx == q->size)
+			cidx = 0;
+	}
+	q->cidx = cidx;
+}
+
+/**
+ * fl_cap - return the capacity of a free-buffer list
+ * @fl: the FL
+ *
+ * Returns the capacity of a free-buffer list.  The capacity is less than
+ * the size because one descriptor needs to be left unpopulated, otherwise
+ * HW will think the FL is empty.
+ */
+static inline unsigned int fl_cap(const struct sge_fl *fl)
+{
+	return fl->size - 8;   /* 1 descriptor = 8 buffers */
+}
+
+/**
+ * fl_starving - return whether a Free List is starving.
+ * @adapter: pointer to the adapter
+ * @fl: the Free List
+ *
+ * Tests specified Free List to see whether the number of buffers
+ * available to the hardware has fallen below our "starvation"
+ * threshold.
+ */
+static inline bool fl_starving(const struct adapter *adapter,
+			       const struct sge_fl *fl)
+{
+	const struct sge *s = &adapter->sge;
+
+	return fl->avail - fl->pend_cred <= s->fl_starve_thres;
+}
+
+static inline unsigned int get_buf_size(struct adapter *adapter,
+					const struct rx_sw_desc *d)
+{
+	struct sge *s = &adapter->sge;
+	unsigned int rx_buf_size_idx = d->dma_addr & RX_BUF_SIZE;
+	unsigned int buf_size;
+
+	switch (rx_buf_size_idx) {
+	case RX_SMALL_PG_BUF:
+		buf_size = PAGE_SIZE;
+		break;
+
+	case RX_LARGE_PG_BUF:
+		buf_size = PAGE_SIZE << s->fl_pg_order;
+		break;
+
+	case RX_SMALL_MTU_BUF:
+		buf_size = FL_MTU_SMALL_BUFSIZE(adapter);
+		break;
+
+	case RX_LARGE_MTU_BUF:
+		buf_size = FL_MTU_LARGE_BUFSIZE(adapter);
+		break;
+
+	default:
+		BUG_ON(1);
+		buf_size = 0; /* deal with bogus compiler warnings */
+		/* NOTREACHED */
+	}
+
+	return buf_size;
+}
+
+/**
+ * free_rx_bufs - free the Rx buffers on an SGE free list
+ * @q: the SGE free list to free buffers from
+ * @n: how many buffers to free
+ *
+ * Release the next @n buffers on an SGE free-buffer Rx queue.   The
+ * buffers must be made inaccessible to HW before calling this function.
+ */
+static void free_rx_bufs(struct sge_fl *q, int n)
+{
+	unsigned int cidx = q->cidx;
+	struct rx_sw_desc *d;
+
+	d = &q->sdesc[cidx];
+	while (n--) {
+		if (d->buf) {
+			rte_pktmbuf_free(d->buf);
+			d->buf = NULL;
+		}
+		++d;
+		if (++cidx == q->size) {
+			cidx = 0;
+			d = q->sdesc;
+		}
+		q->avail--;
+	}
+	q->cidx = cidx;
+}
+
+/**
+ * unmap_rx_buf - unmap the current Rx buffer on an SGE free list
+ * @q: the SGE free list
+ *
+ * Unmap the current buffer on an SGE free-buffer Rx queue.   The
+ * buffer must be made inaccessible to HW before calling this function.
+ *
+ * This is similar to @free_rx_bufs above but does not free the buffer.
+ * Do note that the FL still loses any further access to the buffer.
+ */
+static void unmap_rx_buf(struct sge_fl *q)
+{
+	if (++q->cidx == q->size)
+		q->cidx = 0;
+	q->avail--;
+}
+
+static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
+{
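+	/*
+	 * Free List credits are handed to the hardware in units of one
+	 * descriptor (8 buffers), so only whole multiples of 8 pending
+	 * credits are advertised here; e.g. pend_cred == 19 rings the
+	 * doorbell for 2 descriptors and leaves 3 credits pending.
+	 */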
+	if (q->pend_cred >= 8) {
+		u32 val = adap->params.arch.sge_fl_db;
+
+		if (is_t4(adap->params.chip))
+			val |= V_PIDX(q->pend_cred / 8);
+		else
+			val |= V_PIDX_T5(q->pend_cred / 8);
+
+		/*
+		 * Make sure all memory writes to the Free List queue are
+		 * committed before we tell the hardware about them.
+		 */
+		wmb();
+
+		/*
+		 * If we don't have access to the new User Doorbell (T5+), use
+		 * the old doorbell mechanism; otherwise use the new BAR2
+		 * mechanism.
+		 */
+		if (unlikely(!q->bar2_addr)) {
+			t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
+				     val | V_QID(q->cntxt_id));
+		} else {
+			writel(val | V_QID(q->bar2_qid),
+			       (void *)((uintptr_t)q->bar2_addr +
+			       SGE_UDB_KDOORBELL));
+
+			/*
+			 * This Write memory Barrier will force the write to
+			 * the User Doorbell area to be flushed.
+			 */
+			wmb();
+		}
+		q->pend_cred &= 7;
+	}
+}
+
+static inline struct rte_mbuf *cxgbe_rxmbuf_alloc(struct rte_mempool *mp)
+{
+	struct rte_mbuf *m;
+
+	m = __rte_mbuf_raw_alloc(mp);
+	__rte_mbuf_sanity_check_raw(m, 0);
+	return m;
+}
+
+static inline void set_rx_sw_desc(struct rx_sw_desc *sd, void *buf,
+				  dma_addr_t mapping)
+{
+	sd->buf = buf;
+	sd->dma_addr = mapping;      /* includes size low bits */
+}
+
+/**
+ * refill_fl_usembufs - refill an SGE Rx buffer ring with mbufs
+ * @adap: the adapter
+ * @q: the ring to refill
+ * @n: the number of new buffers to allocate
+ *
+ * (Re)populate an SGE free-buffer queue with up to @n new packet buffers,
+ * allocated from the Rx queue's mempool.  The caller must assure that
+ * @n does not exceed the queue's capacity.  If afterwards the queue is
+ * found critically low mark it as starving in the bitmap of starving FLs.
+ *
+ * Returns the number of buffers allocated.
+ */
+static unsigned int refill_fl_usembufs(struct adapter *adap, struct sge_fl *q,
+				       int n)
+{
+	struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, fl);
+	unsigned int cred = q->avail;
+	__be64 *d = &q->desc[q->pidx];
+	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
+	unsigned int buf_size_idx = RX_SMALL_MTU_BUF;
+
+	while (n--) {
+		struct rte_mbuf *mbuf = cxgbe_rxmbuf_alloc(rxq->rspq.mb_pool);
+		dma_addr_t mapping;
+
+		if (!mbuf) {
+			dev_debug(adap, "%s: mbuf alloc failed\n", __func__);
+			q->alloc_failed++;
+			rxq->rspq.eth_dev->data->rx_mbuf_alloc_failed++;
+			goto out;
+		}
+
+		mbuf->data_off = RTE_PKTMBUF_HEADROOM;
+		mbuf->next = NULL;
+
+		mapping = (dma_addr_t)(mbuf->buf_physaddr + mbuf->data_off);
+
+		mapping |= buf_size_idx;
+		*d++ = cpu_to_be64(mapping);
+		set_rx_sw_desc(sd, mbuf, mapping);
+		sd++;
+
+		q->avail++;
+		if (++q->pidx == q->size) {
+			q->pidx = 0;
+			sd = q->sdesc;
+			d = q->desc;
+		}
+	}
+
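+	/*
+	 * q->avail was sampled into cred on entry, so the difference taken
+	 * below is the number of buffers actually added by this call.
+	 */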
+out:    cred = q->avail - cred;
+	q->pend_cred += cred;
+	ring_fl_db(adap, q);
+
+	if (unlikely(fl_starving(adap, q))) {
+		/*
+		 * Make sure data has been written to free list
+		 */
+		wmb();
+		q->low++;
+	}
+
+	return cred;
+}
+
+/**
+ * refill_fl - refill an SGE Rx buffer ring with mbufs
+ * @adap: the adapter
+ * @q: the ring to refill
+ * @n: the number of new buffers to allocate
+ *
+ * (Re)populate an SGE free-buffer queue with up to @n new packet buffers,
+ * allocated from the Rx queue's mempool.  The caller must assure that
+ * @n does not exceed the queue's capacity.  Returns the number of buffers
+ * allocated.
+ */
+static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n)
+{
+	return refill_fl_usembufs(adap, q, n);
+}
+
+static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
+{
+	refill_fl(adap, fl, min(MAX_RX_REFILL, fl_cap(fl) - fl->avail));
+}
+
+/*
+ * Return the number of reclaimable descriptors in a Tx queue.
+ */
+static inline int reclaimable(const struct sge_txq *q)
+{
+	int hw_cidx = ntohs(q->stat->cidx);
+
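+	/*
+	 * The hardware publishes its consumer index in the queue's status
+	 * page; its distance from the software cidx, modulo the ring size,
+	 * is the number of descriptors that can safely be reclaimed.
+	 */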
+	hw_cidx -= q->cidx;
+	if (hw_cidx < 0)
+		return hw_cidx + q->size;
+	return hw_cidx;
+}
+
+/**
+ * reclaim_completed_tx - reclaims completed Tx descriptors
+ * @q: the Tx queue to reclaim completed descriptors from
+ *
+ * Reclaims Tx descriptors that the SGE has indicated it has processed.
+ */
+void reclaim_completed_tx(struct sge_txq *q)
+{
+	unsigned int avail = reclaimable(q);
+
+	do {
+		/* reclaim as much as possible */
+		reclaim_tx_desc(q, avail);
+		q->in_use -= avail;
+		avail = reclaimable(q);
+	} while (avail);
+}
+
+/**
+ * sgl_len - calculates the size of an SGL of the given capacity
+ * @n: the number of SGL entries
+ *
+ * Calculates the number of flits needed for a scatter/gather list that
+ * can hold the given number of entries.
+ */
+static inline unsigned int sgl_len(unsigned int n)
+{
+	/*
+	 * A Direct Scatter Gather List uses 32-bit lengths and 64-bit PCI DMA
+	 * addresses.  The DSGL Work Request starts off with a 32-bit DSGL
+	 * ULPTX header, then Length0, then Address0, then, for 1 <= i <= N,
+	 * repeated sequences of { Length[i], Length[i+1], Address[i],
+	 * Address[i+1] } (this ensures that all addresses are on 64-bit
+	 * boundaries).  If N is even, then Length[N+1] should be set to 0 and
+	 * Address[N+1] is omitted.
+	 *
+	 * The following calculation incorporates all of the above.  It's
+	 * somewhat hard to follow but, briefly: the "+2" accounts for the
+	 * first two flits which include the DSGL header, Length0 and
+	 * Address0; the "(3*(n-1))/2" covers the main body of list entries (3
+	 * flits for every pair of the remaining N) +1 if (n-1) is odd; and
+	 * finally the "+((n-1)&1)" adds the one remaining flit needed if
+	 * (n-1) is odd ...
+	 */
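+	/*
+	 * Worked example: n == 4 SGL entries needs 2 flits for the header,
+	 * Length0 and Address0, 3 flits for the {Length1, Length2, Address1,
+	 * Address2} pair, and 2 more for the final {Length3, 0, Address3},
+	 * i.e. 7 flits, matching (3*3)/2 + (3&1) + 2 below.
+	 */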
+	n--;
+	return (3 * n) / 2 + (n & 1) + 2;
+}
+
+/**
+ * flits_to_desc - returns the num of Tx descriptors for the given flits
+ * @n: the number of flits
+ *
+ * Returns the number of Tx descriptors needed for the supplied number
+ * of flits.
+ */
+static inline unsigned int flits_to_desc(unsigned int n)
+{
+	return DIV_ROUND_UP(n, 8);
+}
+
+/**
+ * is_eth_imm - can an Ethernet packet be sent as immediate data?
+ * @m: the packet
+ *
+ * Returns whether an Ethernet packet is small enough to fit as
+ * immediate data. Return value corresponds to the headroom required.
+ */
+static inline int is_eth_imm(const struct rte_mbuf *m)
+{
+	unsigned int hdrlen = (m->ol_flags & PKT_TX_TCP_SEG) ?
+			      sizeof(struct cpl_tx_pkt_lso_core) : 0;
+
+	hdrlen += sizeof(struct cpl_tx_pkt);
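+	/*
+	 * Anything that fits within MAX_IMM_TX_PKT_LEN (256 bytes) alongside
+	 * the CPL header(s) is copied inline into the Work Request instead
+	 * of being described by a scatter/gather list.
+	 */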
+	if (m->pkt_len <= MAX_IMM_TX_PKT_LEN - hdrlen)
+		return hdrlen;
+
+	return 0;
+}
+
+/**
+ * calc_tx_flits - calculate the number of flits for a packet Tx WR
+ * @m: the packet
+ *
+ * Returns the number of flits needed for a Tx WR for the given Ethernet
+ * packet, including the needed WR and CPL headers.
+ */
+static inline unsigned int calc_tx_flits(const struct rte_mbuf *m)
+{
+	unsigned int flits;
+	int hdrlen;
+
+	/*
+	 * If the mbuf is small enough, we can pump it out as a work request
+	 * with only immediate data.  In that case we just have to have the
+	 * TX Packet header plus the mbuf data in the Work Request.
+	 */
+
+	hdrlen = is_eth_imm(m);
+	if (hdrlen)
+		return DIV_ROUND_UP(m->pkt_len + hdrlen, sizeof(__be64));
+
+	/*
+	 * Otherwise, we're going to have to construct a Scatter gather list
+	 * of the mbuf body and fragments.  We also include the flits necessary
+	 * for the TX Packet Work Request and CPL.  We always have a firmware
+	 * Write Header (incorporated as part of the cpl_tx_pkt_lso and
+	 * cpl_tx_pkt structures), followed by either a TX Packet Write CPL
+	 * message or, if we're doing a Large Send Offload, an LSO CPL message
+	 * with an embedded TX Packet Write CPL message.
+	 */
+	flits = sgl_len(m->nb_segs);
+	if (m->tso_segsz)
+		flits += (sizeof(struct fw_eth_tx_pkt_wr) +
+			  sizeof(struct cpl_tx_pkt_lso_core) +
+			  sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
+	else
+		flits += (sizeof(struct fw_eth_tx_pkt_wr) +
+			  sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
+	return flits;
+}
+
+/**
+ * calc_tx_descs - calculate the number of Tx descriptors for a packet
+ * @m: the packet
+ *
+ * Returns the number of Tx descriptors needed for the given Ethernet
+ * packet, including the needed WR and CPL headers.
+ */
+static inline unsigned int calc_tx_descs(const struct rte_mbuf *m)
+{
+	return flits_to_desc(calc_tx_flits(m));
+}
+
+/**
+ * write_sgl - populate a scatter/gather list for a packet
+ * @mbuf: the packet
+ * @q: the Tx queue we are writing into
+ * @sgl: starting location for writing the SGL
+ * @end: points right after the end of the SGL
+ * @start: start offset into mbuf main-body data to include in the SGL
+ * @addr: address of mapped region
+ *
+ * Generates a scatter/gather list for the buffers that make up a packet.
+ * The caller must provide adequate space for the SGL that will be written.
+ * The SGL includes all of the packet's page fragments and the data in its
+ * main body except for the first @start bytes.  @sgl must be 16-byte
+ * aligned and within a Tx descriptor with available space.  @end points
+ * right after the end of the SGL but does not account for any potential
+ * wrap around, i.e., @end > @sgl.
+ */
+static void write_sgl(struct rte_mbuf *mbuf, struct sge_txq *q,
+		      struct ulptx_sgl *sgl, u64 *end, unsigned int start,
+		      const dma_addr_t *addr)
+{
+	unsigned int i, len;
+	struct ulptx_sge_pair *to;
+	struct rte_mbuf *m = mbuf;
+	unsigned int nfrags = m->nb_segs;
+	struct ulptx_sge_pair buf[nfrags / 2];
+
+	len = m->data_len - start;
+	sgl->len0 = htonl(len);
+	sgl->addr0 = rte_cpu_to_be_64(addr[0]);
+
+	sgl->cmd_nsge = htonl(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
+			      V_ULPTX_NSGE(nfrags));
+	if (likely(--nfrags == 0))
+		return;
+	/*
+	 * Most of the complexity below deals with the possibility we hit the
+	 * end of the queue in the middle of writing the SGL.  For this case
+	 * only we create the SGL in a temporary buffer and then copy it.
+	 */
+	to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge;
+
+	for (i = 0; nfrags >= 2; nfrags -= 2, to++) {
+		m = m->next;
+		to->len[0] = rte_cpu_to_be_32(m->data_len);
+		to->addr[0] = rte_cpu_to_be_64(addr[++i]);
+		m = m->next;
+		to->len[1] = rte_cpu_to_be_32(m->data_len);
+		to->addr[1] = rte_cpu_to_be_64(addr[++i]);
+	}
+	if (nfrags) {
+		m = m->next;
+		to->len[0] = rte_cpu_to_be_32(m->data_len);
+		to->len[1] = rte_cpu_to_be_32(0);
+		to->addr[0] = rte_cpu_to_be_64(addr[i + 1]);
+	}
+	if (unlikely((u8 *)end > (u8 *)q->stat)) {
+		unsigned int part0 = RTE_PTR_DIFF((u8 *)q->stat,
+						  (u8 *)sgl->sge);
+		unsigned int part1;
+
+		if (likely(part0))
+			memcpy(sgl->sge, buf, part0);
+		part1 = RTE_PTR_DIFF((u8 *)end, (u8 *)q->stat);
+		rte_memcpy(q->desc, RTE_PTR_ADD((u8 *)buf, part0), part1);
+		end = RTE_PTR_ADD((void *)q->desc, part1);
+	}
+	if ((uintptr_t)end & 8)           /* 0-pad to multiple of 16 */
+		*(u64 *)end = 0;
+}
+
+#define IDXDIFF(head, tail, wrap) \
+	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
+
+#define Q_IDXDIFF(q, idx) IDXDIFF((q)->pidx, (q)->idx, (q)->size)
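+/*
+ * Example: with a ring of size 1024, pidx = 3 and dbidx = 1020,
+ * Q_IDXDIFF(q, dbidx) = 1024 - 1020 + 3 = 7, i.e. seven descriptors
+ * have been produced since the doorbell was last rung.
+ */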
+
+/**
+ * ring_tx_db - ring a Tx queue's doorbell
+ * @adap: the adapter
+ * @q: the Tx queue
+ *
+ * Ring the doorbell for a Tx queue.  The number of new descriptors to
+ * hand to the hardware is derived from the gap between the queue's
+ * pidx and dbidx.
+ */
+static inline void ring_tx_db(struct adapter *adap, struct sge_txq *q)
+{
+	int n = Q_IDXDIFF(q, dbidx);
+
+	/*
+	 * Make sure that all writes to the TX Descriptors are committed
+	 * before we tell the hardware about them.
+	 */
+	rte_wmb();
+
+	/*
+	 * If we don't have access to the new User Doorbell (T5+), use the old
+	 * doorbell mechanism; otherwise use the new BAR2 mechanism.
+	 */
+	if (unlikely(!q->bar2_addr)) {
+		u32 val = V_PIDX(n);
+
+		/*
+		 * For T4 we need to participate in the Doorbell Recovery
+		 * mechanism.
+		 */
+		if (!q->db_disabled)
+			t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
+				     V_QID(q->cntxt_id) | val);
+		else
+			q->db_pidx_inc += n;
+		q->db_pidx = q->pidx;
+	} else {
+		u32 val = V_PIDX_T5(n);
+
+		/*
+		 * T4 and later chips share the same PIDX field offset within
+		 * the doorbell, but T5 and later shrank the field in order to
+		 * gain a bit for Doorbell Priority.  The field was absurdly
+		 * large in the first place (14 bits) so we just use the T5
+		 * and later limits and warn if a Queue ID is too large.
+		 */
+		WARN_ON(val & F_DBPRIO);
+
+		writel(val | V_QID(q->bar2_qid),
+		       (void *)((uintptr_t)q->bar2_addr + SGE_UDB_KDOORBELL));
+
+		/*
+		 * This Write Memory Barrier will force the write to the User
+		 * Doorbell area to be flushed.  This is needed to prevent
+		 * writes on different CPUs for the same queue from hitting
+		 * the adapter out of order.  This is required when some Work
+		 * Requests take the Write Combine Gather Buffer path (user
+		 * doorbell area offset [SGE_UDB_WCDOORBELL..+63]) and some
+		 * take the traditional path where we simply increment the
+		 * PIDX (User Doorbell area SGE_UDB_KDOORBELL) and have the
+		 * hardware DMA read the actual Work Request.
+		 */
+		rte_wmb();
+	}
+	q->dbidx = q->pidx;
+}
+
+/*
+ * Figure out what HW csum a packet wants and return the appropriate control
+ * bits.
+ */
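+/*
+ * For illustration: a TCP/IPv4 packet with PKT_TX_IP_CKSUM and
+ * PKT_TX_TCP_CKSUM set yields TX_CSUM_TCPIP together with the IP and
+ * Ethernet header lengths encoded via V_TXPKT_IPHDR_LEN() and
+ * V_TXPKT_ETHHDR_LEN() (V_T6_TXPKT_ETHHDR_LEN() on T6).
+ */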
+static u64 hwcsum(enum chip_type chip, const struct rte_mbuf *m)
+{
+	int csum_type;
+
+	if (m->ol_flags & PKT_TX_IP_CKSUM) {
+		switch (m->ol_flags & PKT_TX_L4_MASK) {
+		case PKT_TX_TCP_CKSUM:
+			csum_type = TX_CSUM_TCPIP;
+			break;
+		case PKT_TX_UDP_CKSUM:
+			csum_type = TX_CSUM_UDPIP;
+			break;
+		default:
+			goto nocsum;
+		}
+	} else {
+		goto nocsum;
+	}
+
+	if (likely(csum_type >= TX_CSUM_TCPIP)) {
+		int hdr_len = V_TXPKT_IPHDR_LEN(m->l3_len);
+		int eth_hdr_len = m->l2_len;
+
+		if (CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5)
+			hdr_len |= V_TXPKT_ETHHDR_LEN(eth_hdr_len);
+		else
+			hdr_len |= V_T6_TXPKT_ETHHDR_LEN(eth_hdr_len);
+		return V_TXPKT_CSUM_TYPE(csum_type) | hdr_len;
+	}
+nocsum:
+	/*
+	 * unknown protocol, disable HW csum
+	 * and hope a bad packet is detected
+	 */
+	return F_TXPKT_L4CSUM_DIS;
+}
+
+static inline void txq_advance(struct sge_txq *q, unsigned int n)
+{
+	q->in_use += n;
+	q->pidx += n;
+	if (q->pidx >= q->size)
+		q->pidx -= q->size;
+}
+
+#define MAX_COALESCE_LEN 64000
+
+static inline int wraps_around(struct sge_txq *q, int ndesc)
+{
+	return (q->pidx + ndesc) > q->size ? 1 : 0;
+}
+
+/**
+ * ship_tx_pkt_coalesce_wr - finalizes and ships a coalesce WR
+ * @adap: adapter structure
+ * @txq: tx queue
+ * @left: number of packets remaining in the current burst
+ *
+ * Writes the header fields of the PKTS WR and sends it.
+ */
+static inline void ship_tx_pkt_coalesce_wr(struct adapter *adap,
+					   struct sge_eth_txq *txq,
+					   uint16_t left)
+{
+	u32 wr_mid;
+	struct sge_txq *q = &txq->q;
+	struct fw_eth_tx_pkts_wr *wr;
+	unsigned int ndesc;
+
+	/* fill the pkts WR header */
+	wr = (void *)&q->desc[q->pidx];
+	wr->op_pkd = htonl(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
+
+	wr_mid = V_FW_WR_LEN16(DIV_ROUND_UP(q->coalesce.flits, 2));
+	ndesc = flits_to_desc(q->coalesce.flits);
+	wr->equiq_to_len16 = htonl(wr_mid);
+	wr->plen = cpu_to_be16(q->coalesce.len);
+	wr->npkt = q->coalesce.idx;
+	wr->r3 = 0;
+	wr->type = q->coalesce.type;
+
+	/* zero out coalesce structure members */
+	q->coalesce.idx = 0;
+	q->coalesce.flits = 0;
+	q->coalesce.len = 0;
+
+	txq_advance(q, ndesc);
+	txq->stats.coal_wr++;
+	txq->stats.coal_pkts += wr->npkt;
+
+	if (Q_IDXDIFF(q, equeidx) >= q->size / 2) {
+		q->equeidx = q->pidx;
+		wr_mid |= F_FW_WR_EQUEQ;
+		wr->equiq_to_len16 = htonl(wr_mid);
+	}
+
+	if (left == 0) {
+		ring_tx_db(adap, q);
+		return;
+	}
+
+	if (Q_IDXDIFF(q, dbidx) >= SGE_MAX_WR_NDESC * 2)
+		ring_tx_db(adap, q);
+}
+
+/**
+ * should_tx_packet_coalesce - decides whether to coalesce an mbuf or not
+ * @txq: tx queue where the mbuf is sent
+ * @mbuf: mbuf to be sent
+ * @nflits: return value for number of flits needed
+ * @adap: adapter structure
+ * @left: number of packets remaining in the current burst
+ *
+ * This function decides if a packet should be coalesced or not.
+ */
+static inline int should_tx_packet_coalesce(struct sge_eth_txq *txq,
+					    struct rte_mbuf *mbuf,
+					    unsigned int *nflits,
+					    struct adapter *adap,
+					    uint16_t left)
+{
+	struct sge_txq *q = &txq->q;
+	unsigned int flits, ndesc;
+	unsigned char type = 0;
+	int credits, hw_cidx = ntohs(q->stat->cidx);
+	int in_use = q->pidx - hw_cidx + flits_to_desc(q->coalesce.flits);
+
+	/* use coal WR type 1 when no frags are present */
+	type = (mbuf->nb_segs == 1) ? 1 : 0;
+
+	if (in_use < 0)
+		in_use += q->size;
+
+	if (unlikely(type != q->coalesce.type && q->coalesce.idx))
+		ship_tx_pkt_coalesce_wr(adap, txq, left);
+
+	/* calculate the number of flits required for coalescing this packet
+	 * without the 2 flits of the WR header.  These are added further down
+	 * if we are just starting a new PKTS WR.  sgl_len doesn't account for
+	 * the possible 16-byte alignment of ULP TX commands, so we do it here.
+	 */
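+	/*
+	 * For example, a single-segment mbuf (type 1) needs
+	 * (sgl_len(1) + 1) & ~1U flits for the SGL plus
+	 * sizeof(struct cpl_tx_pkt_core) / 8 flits for the CPL; a
+	 * multi-segment mbuf (type 0) additionally needs the ulp_txpkt
+	 * and ulptx_idata flits.
+	 */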
+	flits = (sgl_len(mbuf->nb_segs) + 1) & ~1U;
+	if (type == 0)
+		flits += (sizeof(struct ulp_txpkt) +
+			  sizeof(struct ulptx_idata)) / sizeof(__be64);
+	flits += sizeof(struct cpl_tx_pkt_core) / sizeof(__be64);
+	*nflits = flits;
+
+	/* If coalescing is on, the mbuf is added to a pkts WR */
+	if (q->coalesce.idx) {
+		ndesc = DIV_ROUND_UP(q->coalesce.flits + flits, 8);
+		credits = txq_avail(q) - ndesc;
+
+		/* If we are out of credits or would wrap around the ring,
+		 * ship the already-coalesced mbufs and let the non-coalesce
+		 * pass handle this mbuf.
+		 */
+		if (unlikely(credits < 0 || wraps_around(q, ndesc))) {
+			ship_tx_pkt_coalesce_wr(adap, txq, left);
+			return 0;
+		}
+
+		/* If the max coalesce length or the max WR length would be
+		 * exceeded, ship the WR and start coalescing a new one.
+		 */
+		if (unlikely((q->coalesce.len + mbuf->pkt_len >
+						MAX_COALESCE_LEN) ||
+			     (q->coalesce.flits + flits >
+			      q->coalesce.max))) {
+			ship_tx_pkt_coalesce_wr(adap, txq, left);
+			goto new;
+		}
+		return 1;
+	}
+
+new:
+	/* start a new PKTS WR; its header is filled in later by
+	 * ship_tx_pkt_coalesce_wr()
+	 */
+	flits += sizeof(struct fw_eth_tx_pkts_wr) / sizeof(__be64);
+	ndesc = flits_to_desc(q->coalesce.flits + flits);
+	credits = txq_avail(q) - ndesc;
+
+	if (unlikely(credits < 0 || wraps_around(q, ndesc)))
+		return 0;
+	q->coalesce.flits += 2;
+	q->coalesce.type = type;
+	q->coalesce.ptr = (unsigned char *)&q->desc[q->pidx] +
+			   2 * sizeof(__be64);
+	return 1;
+}
+
+/**
+ * tx_do_packet_coalesce - add an mbuf to a coalesce WR
+ * @txq: sge_eth_txq used to send the mbuf
+ * @mbuf: mbuf to be sent
+ * @flits: flits needed for this mbuf
+ * @adap: adapter structure
+ * @pi: port_info structure
+ * @addr: mapped address of the mbuf
+ * @left: number of packets remaining in the current burst
+ *
+ * Adds an mbuf to be sent as part of a coalesce WR by filling a
+ * ulp_tx_pkt command, ulp_tx_sc_imm command, cpl message and
+ * ulp_tx_sc_dsgl command.
+ */
+static inline int tx_do_packet_coalesce(struct sge_eth_txq *txq,
+					struct rte_mbuf *mbuf,
+					int flits, struct adapter *adap,
+					const struct port_info *pi,
+					dma_addr_t *addr,
+					uint16_t left)
+{
+	u64 cntrl, *end;
+	struct sge_txq *q = &txq->q;
+	struct ulp_txpkt *mc;
+	struct ulptx_idata *sc_imm;
+	struct cpl_tx_pkt_core *cpl;
+	struct tx_sw_desc *sd;
+	unsigned int idx = q->coalesce.idx, len = mbuf->pkt_len;
+
+	if (q->coalesce.type == 0) {
+		mc = (struct ulp_txpkt *)q->coalesce.ptr;
+		mc->cmd_dest = htonl(V_ULPTX_CMD(4) | V_ULP_TXPKT_DEST(0) |
+				     V_ULP_TXPKT_FID(adap->sge.fw_evtq.cntxt_id) |
+				     F_ULP_TXPKT_RO);
+		mc->len = htonl(DIV_ROUND_UP(flits, 2));
+		sc_imm = (struct ulptx_idata *)(mc + 1);
+		sc_imm->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM) |
+					 F_ULP_TX_SC_MORE);
+		sc_imm->len = htonl(sizeof(*cpl));
+		end = (u64 *)mc + flits;
+		cpl = (struct cpl_tx_pkt_core *)(sc_imm + 1);
+	} else {
+		end = (u64 *)q->coalesce.ptr + flits;
+		cpl = (struct cpl_tx_pkt_core *)q->coalesce.ptr;
+	}
+
+	/* update coalesce structure for this txq */
+	q->coalesce.flits += flits;
+	q->coalesce.ptr += flits * sizeof(__be64);
+	q->coalesce.len += mbuf->pkt_len;
+
+	/* Fill the CPL message; this mirrors t4_eth_xmit() and should be
+	 * kept in sync with it.
+	 */
+	if (mbuf->ol_flags & PKT_TX_IP_CKSUM) {
+		cntrl = hwcsum(adap->params.chip, mbuf) |
+			       F_TXPKT_IPCSUM_DIS;
+		txq->stats.tx_cso++;
+	} else {
+		cntrl = F_TXPKT_L4CSUM_DIS | F_TXPKT_IPCSUM_DIS;
+	}
+
+	if (mbuf->ol_flags & PKT_TX_VLAN_PKT) {
+		txq->stats.vlan_ins++;
+		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(mbuf->vlan_tci);
+	}
+
+	cpl->ctrl0 = htonl(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
+			   V_TXPKT_INTF(pi->tx_chan) |
+			   V_TXPKT_PF(adap->pf));
+	cpl->pack = htons(0);
+	cpl->len = htons(len);
+	cpl->ctrl1 = cpu_to_be64(cntrl);
+	write_sgl(mbuf, q, (struct ulptx_sgl *)(cpl + 1), end, 0,  addr);
+	txq->stats.pkts++;
+	txq->stats.tx_bytes += len;
+
+	sd = &q->sdesc[q->pidx + (idx >> 1)];
+	if (!(idx & 1)) {
+		if (sd->coalesce.idx) {
+			int i;
+
+			for (i = 0; i < sd->coalesce.idx; i++) {
+				rte_pktmbuf_free(sd->coalesce.mbuf[i]);
+				sd->coalesce.mbuf[i] = NULL;
+			}
+		}
+	}
+
+	/* store pointers to the mbuf and the sgl used in free_tx_desc.
+	 * each tx desc can hold two pointers corresponding to the value
+	 * of ETH_COALESCE_PKT_PER_DESC
+	 */
+	sd->coalesce.mbuf[idx & 1] = mbuf;
+	sd->coalesce.sgl[idx & 1] = (struct ulptx_sgl *)(cpl + 1);
+	sd->coalesce.idx = (idx & 1) + 1;
+
+	/* send the coalesced WR if the max is reached or this is the last mbuf */
+	if (++q->coalesce.idx == ETH_COALESCE_PKT_NUM || left == 0)
+		ship_tx_pkt_coalesce_wr(adap, txq, left);
+	return 0;
+}
+
+/**
+ * t4_eth_xmit - add a packet to an Ethernet Tx queue
+ * @txq: the egress queue
+ * @mbuf: the packet
+ * @left: number of packets remaining in the current burst
+ *
+ * Add a packet to an SGE Ethernet Tx queue.
+ */
+int t4_eth_xmit(struct sge_eth_txq *txq, struct rte_mbuf *mbuf, uint16_t left)
+{
+	const struct port_info *pi;
+	struct cpl_tx_pkt_lso_core *lso;
+	struct adapter *adap;
+	struct rte_mbuf *m = mbuf;
+	struct fw_eth_tx_pkt_wr *wr;
+	struct cpl_tx_pkt_core *cpl;
+	struct tx_sw_desc *d;
+	dma_addr_t addr[m->nb_segs];
+	unsigned int flits, ndesc, cflits;
+	int l3hdr_len, l4hdr_len, eth_xtra_len;
+	int len, last_desc;
+	int credits;
+	u32 wr_mid;
+	u64 cntrl, *end;
+	bool v6;
+
+	/* Reject xmit if queue is stopped */
+	if (unlikely(txq->flags & EQ_STOPPED))
+		return -(EBUSY);
+
+	/*
+	 * The chip min packet length is 10 octets but play safe and reject
+	 * anything shorter than an Ethernet header.
+	 */
+	if (unlikely(m->pkt_len < ETHER_HDR_LEN)) {
+out_free:
+		rte_pktmbuf_free(m);
+		return 0;
+	}
+
+	rte_prefetch0(&((&txq->q)->sdesc->mbuf->pool));
+	pi = (struct port_info *)txq->eth_dev->data->dev_private;
+	adap = pi->adapter;
+
+	cntrl = F_TXPKT_L4CSUM_DIS | F_TXPKT_IPCSUM_DIS;
+	/* align the end of coalesce WR to a 512 byte boundary */
+	txq->q.coalesce.max = (8 - (txq->q.pidx & 7)) * 8;
+
+	if (!(m->ol_flags & PKT_TX_TCP_SEG)) {
+		if (should_tx_packet_coalesce(txq, mbuf, &cflits, adap, left)) {
+			if (unlikely(map_mbuf(mbuf, addr) < 0)) {
+				dev_warn(adap, "%s: mapping err for coalesce\n",
+					 __func__);
+				txq->stats.mapping_err++;
+				goto out_free;
+			}
+			return tx_do_packet_coalesce(txq, mbuf, cflits, adap,
+						     pi, addr, left);
+		} else {
+			return -EBUSY;
+		}
+	}
+
+	if (txq->q.coalesce.idx)
+		ship_tx_pkt_coalesce_wr(adap, txq, left);
+
+	flits = calc_tx_flits(m);
+	ndesc = flits_to_desc(flits);
+	credits = txq_avail(&txq->q) - ndesc;
+
+	if (unlikely(credits < 0)) {
+		dev_debug(adap, "%s: Tx ring %u full; credits = %d\n",
+			  __func__, txq->q.cntxt_id, credits);
+		return -EBUSY;
+	}
+
+	if (unlikely(map_mbuf(m, addr) < 0)) {
+		txq->stats.mapping_err++;
+		goto out_free;
+	}
+
+	wr_mid = V_FW_WR_LEN16(DIV_ROUND_UP(flits, 2));
+	if (Q_IDXDIFF(&txq->q, equeidx) >= 64 || left == 0) {
+		txq->q.equeidx = txq->q.pidx;
+		wr_mid |= F_FW_WR_EQUEQ;
+	}
+
+	wr = (void *)&txq->q.desc[txq->q.pidx];
+	wr->equiq_to_len16 = htonl(wr_mid);
+	wr->r3 = rte_cpu_to_be_64(0);
+	end = (u64 *)wr + flits;
+
+	len = 0;
+	len += sizeof(*cpl);
+	lso = (void *)(wr + 1);
+	v6 = (m->ol_flags & PKT_TX_IPV6) != 0;
+	l3hdr_len = m->l3_len;
+	l4hdr_len = m->l4_len;
+	eth_xtra_len = m->l2_len - ETHER_HDR_LEN;
+	len += sizeof(*lso);
+	wr->op_immdlen = htonl(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
+			       V_FW_WR_IMMDLEN(len));
+	lso->lso_ctrl = htonl(V_LSO_OPCODE(CPL_TX_PKT_LSO) |
+			      F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
+			      V_LSO_IPV6(v6) |
+			      V_LSO_ETHHDR_LEN(eth_xtra_len / 4) |
+			      V_LSO_IPHDR_LEN(l3hdr_len / 4) |
+			      V_LSO_TCPHDR_LEN(l4hdr_len / 4));
+	lso->ipid_ofst = htons(0);
+	lso->mss = htons(m->tso_segsz);
+	lso->seqno_offset = htonl(0);
+	if (is_t4(adap->params.chip))
+		lso->len = htonl(m->pkt_len);
+	else
+		lso->len = htonl(V_LSO_T5_XFER_SIZE(m->pkt_len));
+	cpl = (void *)(lso + 1);
+	cntrl = V_TXPKT_CSUM_TYPE(v6 ? TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
+				  V_TXPKT_IPHDR_LEN(l3hdr_len) |
+				  V_TXPKT_ETHHDR_LEN(eth_xtra_len);
+	txq->stats.tso++;
+	txq->stats.tx_cso += m->tso_segsz;
+
+	if (m->ol_flags & PKT_TX_VLAN_PKT) {
+		txq->stats.vlan_ins++;
+		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->vlan_tci);
+	}
+
+	cpl->ctrl0 = htonl(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
+			   V_TXPKT_INTF(pi->tx_chan) |
+			   V_TXPKT_PF(adap->pf));
+	cpl->pack = htons(0);
+	cpl->len = htons(m->pkt_len);
+	cpl->ctrl1 = cpu_to_be64(cntrl);
+
+	txq->stats.pkts++;
+	txq->stats.tx_bytes += m->pkt_len;
+	last_desc = txq->q.pidx + ndesc - 1;
+	if (last_desc >= (int)txq->q.size)
+		last_desc -= txq->q.size;
+
+	d = &txq->q.sdesc[last_desc];
+	if (d->mbuf) {
+		rte_pktmbuf_free(d->mbuf);
+		d->mbuf = NULL;
+	}
+	write_sgl(m, &txq->q, (struct ulptx_sgl *)(cpl + 1), end, 0,
+		  addr);
+	txq->q.sdesc[last_desc].mbuf = m;
+	txq->q.sdesc[last_desc].sgl = (struct ulptx_sgl *)(cpl + 1);
+	txq_advance(&txq->q, ndesc);
+	ring_tx_db(adap, &txq->q);
+	return 0;
+}
+
+/**
+ * alloc_ring - allocate resources for an SGE descriptor ring
+ * @nelem: the number of descriptors
+ * @elem_size: the size of each descriptor
+ * @sw_size: the size of the SW state associated with each ring element
+ * @phys: the physical address of the allocated ring
+ * @metadata: address of the array holding the SW state for the ring
+ * @stat_size: extra space in HW ring for status information
+ * @queue_id: the queue index (currently unused)
+ * @socket_id: preferred NUMA socket for memory allocations
+ * @z_name: memzone name for the HW ring
+ * @z_name_sw: name for the SW ring allocation
+ *
+ * Allocates resources for an SGE descriptor ring, such as Tx queues,
+ * free buffer lists, or response queues.  Each SGE ring requires
+ * space for its HW descriptors plus, optionally, space for the SW state
+ * associated with each HW entry (the metadata).  The function returns
+ * three values: the virtual address for the HW ring (the return value
+ * of the function), the bus address of the HW ring, and the address
+ * of the SW ring.
+ */
+static void *alloc_ring(size_t nelem, size_t elem_size,
+			size_t sw_size, dma_addr_t *phys, void *metadata,
+			size_t stat_size, __rte_unused uint16_t queue_id,
+			int socket_id, const char *z_name,
+			const char *z_name_sw)
+{
+	size_t len = CXGBE_MAX_RING_DESC_SIZE * elem_size + stat_size;
+	const struct rte_memzone *tz;
+	void *s = NULL;
+
+	dev_debug(adapter, "%s: nelem = %lu; elem_size = %lu; sw_size = %lu; "
+		  "stat_size = %lu; queue_id = %u; socket_id = %d; z_name = %s;"
+		  " z_name_sw = %s\n", __func__, nelem, elem_size, sw_size,
+		  stat_size, queue_id, socket_id, z_name, z_name_sw);
+
+	tz = rte_memzone_lookup(z_name);
+	if (tz) {
+		dev_debug(adapter, "%s: tz exists...returning existing..\n",
+			  __func__);
+		goto alloc_sw_ring;
+	}
+
+	/*
+	 * Allocate TX/RX ring hardware descriptors. A memzone large enough to
+	 * handle the maximum ring size is allocated in order to allow for
+	 * resizing in later calls to the queue setup function.
+	 */
+	tz = rte_memzone_reserve_aligned(z_name, len, socket_id, 0, 4096);
+	if (!tz)
+		return NULL;
+
+alloc_sw_ring:
+	memset(tz->addr, 0, len);
+	if (sw_size) {
+		s = rte_zmalloc_socket(z_name_sw, nelem * sw_size,
+				       RTE_CACHE_LINE_SIZE, socket_id);
+
+		if (!s) {
+			dev_err(adapter, "%s: failed to get sw_ring memory\n",
+				__func__);
+			return NULL;
+		}
+	}
+	if (metadata)
+		*(void **)metadata = s;
+
+	*phys = (uint64_t)tz->phys_addr;
+	return tz->addr;
+}
+
+/**
+ * t4_pktgl_to_mbuf_usembufs - build an mbuf from a packet gather list
+ * @gl: the gather list
+ *
+ * Builds an mbuf from the given packet gather list.  Returns the mbuf, or
+ * %NULL if the gather list has more than one fragment (multi-fragment
+ * gather lists are not assembled here).
+ */
+static struct rte_mbuf *t4_pktgl_to_mbuf_usembufs(const struct pkt_gl *gl)
+{
+	/*
+	 * If there's only one mbuf fragment, just return that.
+	 */
+	if (likely(gl->nfrags == 1))
+		return gl->mbufs[0];
+
+	return NULL;
+}
+
+/**
+ * t4_pktgl_to_mbuf - build an mbuf from a packet gather list
+ * @gl: the gather list
+ *
+ * Builds an mbuf from the given packet gather list.  Returns the mbuf or
+ * %NULL if mbuf allocation failed.
+ */
+static struct rte_mbuf *t4_pktgl_to_mbuf(const struct pkt_gl *gl)
+{
+	return t4_pktgl_to_mbuf_usembufs(gl);
+}
+
+#define RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb) \
+	((dma_addr_t) ((mb)->buf_physaddr + (mb)->data_off))
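+/*
+ * Note: this macro yields the bus address of the start of the mbuf's
+ * data area, i.e. buf_physaddr plus the data_off offset into the
+ * buffer.
+ */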
+
+/**
+ * t4_ethrx_handler - process an ingress ethernet packet
+ * @q: the response queue that received the packet
+ * @rsp: the response queue descriptor holding the RX_PKT message
+ * @si: the gather list of packet fragments
+ *
+ * Process an ingress Ethernet packet and fill in its mbuf metadata.
+ */
+int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
+		     const struct pkt_gl *si)
+{
+	struct rte_mbuf *mbuf;
+	const struct cpl_rx_pkt *pkt;
+	const struct rss_header *rss_hdr;
+	bool csum_ok;
+	struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
+	struct adapter *adapter = q->adapter;
+	struct sge *s = &adapter->sge;
+
+	rss_hdr = (const void *)rsp;
+	pkt = (const void *)&rsp[1];
+	csum_ok = pkt->csum_calc && !pkt->err_vec;
+
+	mbuf = t4_pktgl_to_mbuf(si);
+	if (unlikely(!mbuf)) {
+		rxq->stats.rx_drops++;
+		return 0;
+	}
+
+	rte_pktmbuf_adj(mbuf, s->pktshift);
+	mbuf->port = pkt->iff;
+	if (pkt->l2info & htonl(F_RXF_IP)) {
+		mbuf->ol_flags |= PKT_RX_IPV4_HDR;
+		if (unlikely(!csum_ok))
+			mbuf->ol_flags |= PKT_RX_IP_CKSUM_BAD;
+
+		if ((pkt->l2info & htonl(F_RXF_UDP | F_RXF_TCP)) && !csum_ok)
+			mbuf->ol_flags |= PKT_RX_L4_CKSUM_BAD;
+	} else if (pkt->l2info & htonl(F_RXF_IP6)) {
+		mbuf->ol_flags |= PKT_RX_IPV6_HDR;
+	}
+
+	if (!rss_hdr->filter_tid && rss_hdr->hash_type) {
+		mbuf->ol_flags |= PKT_RX_RSS_HASH;
+		mbuf->hash.rss = ntohl(rss_hdr->hash_val);
+	}
+
+	if (pkt->vlan_ex) {
+		mbuf->ol_flags |= PKT_RX_VLAN_PKT;
+		mbuf->vlan_tci = ntohs(pkt->vlan);
+	}
+	rxq->stats.pkts++;
+	rxq->stats.rx_bytes += mbuf->pkt_len;
+
+	return 0;
+}
+
+/**
+ * restore_rx_bufs - put back a packet's Rx buffers
+ * @q: the SGE free list
+ * @frags: number of FL buffers to restore
+ *
+ * Puts the Rx buffers back on an FL.  The buffers have already been
+ * unmapped and are left unmapped.
+ *
+ * This function undoes a series of @unmap_rx_buf calls when we find out
+ * that the current packet can't be processed right away after all and we
+ * need to come back to it later.  This is a very rare event and there's
+ * no effort to make this particularly efficient.
+ */
+static void restore_rx_bufs(struct sge_fl *q, int frags)
+{
+	while (frags--) {
+		if (q->cidx == 0)
+			q->cidx = q->size - 1;
+		else
+			q->cidx--;
+		q->avail++;
+	}
+}
+
+/**
+ * is_new_response - check if a response is newly written
+ * @r: the response descriptor
+ * @q: the response queue
+ *
+ * Returns true if a response descriptor contains a yet unprocessed
+ * response.
+ */
+static inline bool is_new_response(const struct rsp_ctrl *r,
+				   const struct sge_rspq *q)
+{
+	return (r->u.type_gen >> S_RSPD_GEN) == q->gen;
+}
+
+#define CXGB4_MSG_AN ((void *)1)
+
+/**
+ * rspq_next - advance to the next entry in a response queue
+ * @q: the queue
+ *
+ * Updates the state of a response queue to advance it to the next entry.
+ */
+static inline void rspq_next(struct sge_rspq *q)
+{
+	q->cur_desc = (const __be64 *)((const char *)q->cur_desc + q->iqe_len);
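+	/*
+	 * On wrap-around the generation bit is flipped below so that stale
+	 * descriptors from the previous pass through the ring are not
+	 * mistaken for new responses by is_new_response().
+	 */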
+	if (unlikely(++q->cidx == q->size)) {
+		q->cidx = 0;
+		q->gen ^= 1;
+		q->cur_desc = q->desc;
+	}
+}
+
+/**
+ * process_responses - process responses from an SGE response queue
+ * @q: the ingress queue to process
+ * @budget: how many responses can be processed in this round
+ * @rx_pkts: array in which to return the received packet mbufs
+ *
+ * Process responses from an SGE response queue up to the supplied budget.
+ * Responses include received packets as well as control messages from FW
+ * or HW.
+ *
+ * Additionally choose the interrupt holdoff time for the next interrupt
+ * on this queue.  If the system is under memory shortage use a fairly
+ * long delay to help recovery.
+ */
+static int process_responses(struct sge_rspq *q, int budget,
+			     struct rte_mbuf **rx_pkts)
+{
+	int ret = 0, rsp_type;
+	int budget_left = budget;
+	const struct rsp_ctrl *rc;
+	struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
+	struct adapter *adapter = q->adapter;
+
+	while (likely(budget_left)) {
+		rc = (const struct rsp_ctrl *)
+		     ((const char *)q->cur_desc + (q->iqe_len - sizeof(*rc)));
+
+		if (!is_new_response(rc, q))
+			break;
+
+		/*
+		 * Ensure response has been read
+		 */
+		rmb();
+		rsp_type = G_RSPD_TYPE(rc->u.type_gen);
+
+		if (likely(rsp_type == X_RSPD_TYPE_FLBUF)) {
+			struct pkt_gl si;
+			const struct rx_sw_desc *rsd;
+			struct rte_mbuf *pkt = NULL;
+			u32 len = ntohl(rc->pldbuflen_qid), bufsz, frags;
+
+			si.usembufs = rxq->usembufs;
+			/*
+			 * In "use mbufs" mode, we don't pack multiple
+			 * ingress packets per buffer (mbuf) so we
+			 * should _always_ get a "New Buffer" flag
+			 * from the SGE.  Also, since we hand the
+			 * mbufs up to the application for it to
+			 * eventually free, we don't release the mbufs
+			 * in the driver (in contrast to the "packed
+			 * page" mode where the driver needs to
+			 * release its reference on the page buffers).
+			 */
+			BUG_ON(!(len & F_RSPD_NEWBUF));
+			len = G_RSPD_LEN(len);
+			si.tot_len = len;
+
+			/* gather packet fragments */
+			for (frags = 0; len; frags++) {
+				rsd = &rxq->fl.sdesc[rxq->fl.cidx];
+				bufsz = min(get_buf_size(adapter, rsd), len);
+				pkt = rsd->buf;
+				pkt->data_len = bufsz;
+				pkt->pkt_len = bufsz;
+				si.mbufs[frags] = pkt;
+				len -= bufsz;
+				unmap_rx_buf(&rxq->fl);
+			}
+
+			si.va = RTE_PTR_ADD(si.mbufs[0]->buf_addr,
+					    si.mbufs[0]->data_off);
+			rte_prefetch1(si.va);
+
+			/*
+			 * For the "use mbuf" case here, we can end up
+			 * chewing through our Free List very rapidly
+			 * with one entry per Ingress packet getting
+			 * consumed.  So if the handler() successfully
+			 * consumed the mbuf, check to see if we can
+			 * refill the Free List incrementally in the
+			 * loop ...
+			 */
+			si.nfrags = frags;
+			ret = q->handler(q, q->cur_desc, &si);
+
+			if (unlikely(ret != 0)) {
+				restore_rx_bufs(&rxq->fl, frags);
+			} else {
+				rx_pkts[budget - budget_left] = pkt;
+				if (fl_cap(&rxq->fl) - rxq->fl.avail >= 8)
+					__refill_fl(q->adapter, &rxq->fl);
+			}
+
+		} else if (likely(rsp_type == X_RSPD_TYPE_CPL)) {
+			ret = q->handler(q, q->cur_desc, NULL);
+		} else {
+			ret = q->handler(q, (const __be64 *)rc, CXGB4_MSG_AN);
+		}
+
+		if (unlikely(ret)) {
+			/* couldn't process descriptor, back off for recovery */
+			q->next_intr_params = V_QINTR_TIMER_IDX(NOMEM_TMR_IDX);
+			break;
+		}
+
+		rspq_next(q);
+		budget_left--;
+	}
+
+	/*
+	 * If this is a Response Queue with an associated Free List and
+	 * there's room for another chunk of new Free List buffer pointers,
+	 * refill the Free List.
+	 */
+
+	if (q->offset >= 0 && fl_cap(&rxq->fl) - rxq->fl.avail >= 8)
+		__refill_fl(q->adapter, &rxq->fl);
+
+	return budget - budget_left;
+}
+
+int cxgbe_poll(struct sge_rspq *q, struct rte_mbuf **rx_pkts,
+	       unsigned int budget, unsigned int *work_done)
+{
+	unsigned int params;
+	u32 val;
+	int err = 0;
+
+	*work_done = process_responses(q, budget, rx_pkts);
+	params = V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX);
+	q->next_intr_params = params;
+	val = V_CIDXINC(*work_done) | V_SEINTARM(params);
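+	/*
+	 * The GTS write below returns *work_done credits to the hardware
+	 * (V_CIDXINC) and arms the next interrupt holdoff parameters
+	 * (V_SEINTARM) in a single doorbell.
+	 */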
+
+	if (*work_done) {
+		/*
+		 * If we don't have access to the new User GTS (T5+),
+		 * use the old doorbell mechanism; otherwise use the new
+		 * BAR2 mechanism.
+		 */
+		if (unlikely(!q->bar2_addr))
+			t4_write_reg(q->adapter, MYPF_REG(A_SGE_PF_GTS),
+				     val | V_INGRESSQID((u32)q->cntxt_id));
+		else {
+			writel(val | V_INGRESSQID(q->bar2_qid),
+			       (void *)((uintptr_t)q->bar2_addr +
+			       SGE_UDB_GTS));
+			/*
+			 * This Write Memory Barrier will force the write to
+			 * the User Doorbell area to be flushed.
+			 */
+			wmb();
+		}
+	}
+
+	return err;
+}
+
+/**
+ * bar2_address - return the BAR2 address for an SGE Queue's Registers
+ * @adapter: the adapter
+ * @qid: the SGE Queue ID
+ * @qtype: the SGE Queue Type (Egress or Ingress)
+ * @pbar2_qid: BAR2 Queue ID or 0 for Queue ID inferred SGE Queues
+ *
+ * Returns the BAR2 address for the SGE Queue Registers associated with
+ * @qid.  If BAR2 SGE Registers aren't available, returns NULL.  Also
+ * returns the BAR2 Queue ID to be used with writes to the BAR2 SGE
+ * Queue Registers.  If the BAR2 Queue ID is 0, then "Inferred Queue ID"
+ * Registers are supported (e.g. the Write Combining Doorbell Buffer).
+ */
+static void __iomem *bar2_address(struct adapter *adapter, unsigned int qid,
+				  enum t4_bar2_qtype qtype,
+				  unsigned int *pbar2_qid)
+{
+	u64 bar2_qoffset;
+	int ret;
+
+	ret = t4_bar2_sge_qregs(adapter, qid, qtype, &bar2_qoffset, pbar2_qid);
+	if (ret)
+		return NULL;
+
+	return adapter->bar2 + bar2_qoffset;
+}
+
+int t4_sge_eth_rxq_start(struct adapter *adap, struct sge_rspq *rq)
+{
+	struct sge_eth_rxq *rxq = container_of(rq, struct sge_eth_rxq, rspq);
+	unsigned int fl_id = rxq->fl.size ? rxq->fl.cntxt_id : 0xffff;
+
+	return t4_iq_start_stop(adap, adap->mbox, true, adap->pf, 0,
+				rq->cntxt_id, fl_id, 0xffff);
+}
+
+int t4_sge_eth_rxq_stop(struct adapter *adap, struct sge_rspq *rq)
+{
+	struct sge_eth_rxq *rxq = container_of(rq, struct sge_eth_rxq, rspq);
+	unsigned int fl_id = rxq->fl.size ? rxq->fl.cntxt_id : 0xffff;
+
+	return t4_iq_start_stop(adap, adap->mbox, false, adap->pf, 0,
+				rq->cntxt_id, fl_id, 0xffff);
+}
+
+/*
+ * @intr_idx: MSI/MSI-X vector if >=0, -(absolute qid + 1) if < 0
+ * @cong: < 0 -> no congestion feedback, >= 0 -> congestion channel map
+ */
+int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
+		     struct rte_eth_dev *eth_dev, int intr_idx,
+		     struct sge_fl *fl, rspq_handler_t hnd, int cong,
+		     struct rte_mempool *mp, int queue_id, int socket_id)
+{
+	int ret, flsz = 0;
+	struct fw_iq_cmd c;
+	struct sge *s = &adap->sge;
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	char z_name[RTE_MEMZONE_NAMESIZE];
+	char z_name_sw[RTE_MEMZONE_NAMESIZE];
+	unsigned int nb_refill;
+
+	/* Size needs to be a multiple of 16, including the status entry. */
+	iq->size = roundup(iq->size, 16);
+
+	snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
+		 eth_dev->driver->pci_drv.name, fwevtq ? "fwq_ring" : "rx_ring",
+		 eth_dev->data->port_id, queue_id);
+	snprintf(z_name_sw, sizeof(z_name_sw), "%s_sw_ring", z_name);
+
+	iq->desc = alloc_ring(iq->size, iq->iqe_len, 0, &iq->phys_addr, NULL, 0,
+			      queue_id, socket_id, z_name, z_name_sw);
+	if (!iq->desc)
+		return -ENOMEM;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
+			    F_FW_CMD_WRITE | F_FW_CMD_EXEC |
+			    V_FW_IQ_CMD_PFN(adap->pf) | V_FW_IQ_CMD_VFN(0));
+	c.alloc_to_len16 = htonl(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
+				 (sizeof(c) / 16));
+	c.type_to_iqandstindex =
+		htonl(V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
+		      V_FW_IQ_CMD_IQASYNCH(fwevtq) |
+		      V_FW_IQ_CMD_VIID(pi->viid) |
+		      V_FW_IQ_CMD_IQANDST(intr_idx < 0) |
+		      V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT) |
+		      V_FW_IQ_CMD_IQANDSTINDEX(intr_idx >= 0 ? intr_idx :
+							       -intr_idx - 1));
+	c.iqdroprss_to_iqesize =
+		htons(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
+		      F_FW_IQ_CMD_IQGTSMODE |
+		      V_FW_IQ_CMD_IQINTCNTTHRESH(iq->pktcnt_idx) |
+		      V_FW_IQ_CMD_IQESIZE(ilog2(iq->iqe_len) - 4));
+	c.iqsize = htons(iq->size);
+	c.iqaddr = cpu_to_be64(iq->phys_addr);
+	if (cong >= 0)
+		c.iqns_to_fl0congen = htonl(F_FW_IQ_CMD_IQFLINTCONGEN);
+
+	if (fl) {
+		struct sge_eth_rxq *rxq = container_of(fl, struct sge_eth_rxq,
+						       fl);
+		enum chip_type chip = CHELSIO_CHIP_VERSION(adap->params.chip);
+
+		/*
+		 * Allocate the ring for the hardware free list (with space
+		 * for its status page) along with the associated software
+		 * descriptor ring.  The free list size needs to be a multiple
+		 * of the Egress Queue Unit and at least 2 Egress Units larger
+		 * than the SGE's Egress Congestion Threshold
+		 * (fl_starve_thres - 1).
+		 */
+		if (fl->size < s->fl_starve_thres - 1 + 2 * 8)
+			fl->size = s->fl_starve_thres - 1 + 2 * 8;
+		fl->size = roundup(fl->size, 8);
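+		/*
+		 * For illustration, with fl_starve_thres = 641 the minimum
+		 * usable size is 640 + 16 = 656 entries, which is already a
+		 * multiple of 8, so roundup() leaves it unchanged.
+		 */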
+
+		snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
+			 eth_dev->driver->pci_drv.name,
+			 fwevtq ? "fwq_ring" : "fl_ring",
+			 eth_dev->data->port_id, queue_id);
+		snprintf(z_name_sw, sizeof(z_name_sw), "%s_sw_ring", z_name);
+
+		fl->desc = alloc_ring(fl->size, sizeof(__be64),
+				      sizeof(struct rx_sw_desc),
+				      &fl->addr, &fl->sdesc, s->stat_len,
+				      queue_id, socket_id, z_name, z_name_sw);
+
+		if (!fl->desc)
+			goto fl_nomem;
+
+		flsz = fl->size / 8 + s->stat_len / sizeof(struct tx_desc);
+		c.iqns_to_fl0congen |=
+			htonl(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
+			      (unlikely(rxq->usembufs) ?
+			       0 : F_FW_IQ_CMD_FL0PACKEN) |
+			      F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
+			      F_FW_IQ_CMD_FL0PADEN);
+		if (cong >= 0)
+			c.iqns_to_fl0congen |=
+				htonl(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
+				      F_FW_IQ_CMD_FL0CONGCIF |
+				      F_FW_IQ_CMD_FL0CONGEN);
+
+		/* On T6, an egress queue of type FL carries an internal 16B
+		 * header overhead going into the FLM module, so the maximum
+		 * allowed fetch burst size is 448 bytes.
+		 */
+		c.fl0dcaen_to_fl0cidxfthresh =
+			htons(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_64B) |
+			      V_FW_IQ_CMD_FL0FBMAX((chip <= CHELSIO_T5) ?
+			      X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
+		c.fl0size = htons(flsz);
+		c.fl0addr = cpu_to_be64(fl->addr);
+	}
+
+	ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
+	if (ret)
+		goto err;
+
+	iq->cur_desc = iq->desc;
+	iq->cidx = 0;
+	iq->gen = 1;
+	iq->next_intr_params = iq->intr_params;
+	iq->cntxt_id = ntohs(c.iqid);
+	iq->abs_id = ntohs(c.physiqid);
+	iq->bar2_addr = bar2_address(adap, iq->cntxt_id, T4_BAR2_QTYPE_INGRESS,
+				     &iq->bar2_qid);
+	iq->size--;                           /* subtract status entry */
+	iq->eth_dev = eth_dev;
+	iq->handler = hnd;
+	iq->mb_pool = mp;
+
+	/* set offset to -1 to distinguish ingress queues without FL */
+	iq->offset = fl ? 0 : -1;
+
+	if (fl) {
+		fl->cntxt_id = ntohs(c.fl0id);
+		fl->avail = 0;
+		fl->pend_cred = 0;
+		fl->pidx = 0;
+		fl->cidx = 0;
+		fl->alloc_failed = 0;
+
+		/*
+		 * Note, we must initialize the BAR2 Free List User Doorbell
+		 * information before refilling the Free List!
+		 */
+		fl->bar2_addr = bar2_address(adap, fl->cntxt_id,
+					     T4_BAR2_QTYPE_EGRESS,
+					     &fl->bar2_qid);
+
+		nb_refill = refill_fl(adap, fl, fl_cap(fl));
+		if (nb_refill != fl_cap(fl)) {
+			ret = -ENOMEM;
+			dev_err(adap, "%s: mbuf alloc failed with error: %d\n",
+				__func__, ret);
+			goto refill_fl_err;
+		}
+	}
+
+	/*
+	 * For T5 and later we attempt to set up the Congestion Manager values
+	 * of the new RX Ethernet Queue.  This should really be handled by
+	 * firmware because it's more complex than any host driver wants to
+	 * get involved with, it differs per chip, and what we do here is
+	 * almost certainly wrong.  Firmware would be wrong as well, but it
+	 * would be a lot easier to fix in one place ...  For now we do
+	 * something very simple (and hopefully less wrong).
+	 */
+	if (!is_t4(adap->params.chip) && cong >= 0) {
+		u32 param, val;
+		int i;
+
+		param = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
+			 V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
+			 V_FW_PARAMS_PARAM_YZ(iq->cntxt_id));
+		if (cong == 0) {
+			val = V_CONMCTXT_CNGTPMODE(X_CONMCTXT_CNGTPMODE_QUEUE);
+		} else {
+			val = V_CONMCTXT_CNGTPMODE(
+					X_CONMCTXT_CNGTPMODE_CHANNEL);
+			for (i = 0; i < 4; i++) {
+				if (cong & (1 << i))
+					val |= V_CONMCTXT_CNGCHMAP(1 <<
+								   (i << 2));
+			}
+		}
+		ret = t4_set_params(adap, adap->mbox, adap->pf, 0, 1,
+				    &param, &val);
+		if (ret)
+			dev_warn(adap->pdev_dev, "Failed to set Congestion Manager Context for Ingress Queue %d: %d\n",
+				 iq->cntxt_id, -ret);
+	}
+
+	return 0;
+
+refill_fl_err:
+	t4_iq_free(adap, adap->mbox, adap->pf, 0, FW_IQ_TYPE_FL_INT_CAP,
+		   iq->cntxt_id, fl ? fl->cntxt_id : 0xffff, 0xffff);
+fl_nomem:
+	ret = -ENOMEM;
+err:
+	iq->cntxt_id = 0;
+	iq->abs_id = 0;
+	if (iq->desc)
+		iq->desc = NULL;
+
+	if (fl && fl->desc) {
+		rte_free(fl->sdesc);
+		fl->cntxt_id = 0;
+		fl->sdesc = NULL;
+		fl->desc = NULL;
+	}
+	return ret;
+}
+
+static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id)
+{
+	q->cntxt_id = id;
+	q->bar2_addr = bar2_address(adap, q->cntxt_id, T4_BAR2_QTYPE_EGRESS,
+				    &q->bar2_qid);
+	q->cidx = 0;
+	q->pidx = 0;
+	q->dbidx = 0;
+	q->in_use = 0;
+	q->equeidx = 0;
+	q->coalesce.idx = 0;
+	q->coalesce.len = 0;
+	q->coalesce.flits = 0;
+	q->stat = (void *)&q->desc[q->size];
+}
+
+int t4_sge_eth_txq_start(struct sge_eth_txq *txq)
+{
+	/*
+	 *  TODO: For flow-control, queue may be stopped waiting to reclaim
+	 *  credits.
+	 *  Ensure queue is in EQ_STOPPED state before starting it.
+	 */
+	if (!(txq->flags & EQ_STOPPED))
+		return -(EBUSY);
+
+	txq->flags &= ~EQ_STOPPED;
+
+	return 0;
+}
+
+int t4_sge_eth_txq_stop(struct sge_eth_txq *txq)
+{
+	txq->flags |= EQ_STOPPED;
+
+	return 0;
+}
+
+int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
+			 struct rte_eth_dev *eth_dev, uint16_t queue_id,
+			 unsigned int iqid, int socket_id)
+{
+	int ret, nentries;
+	struct fw_eq_eth_cmd c;
+	struct sge *s = &adap->sge;
+	struct port_info *pi = (struct port_info *)(eth_dev->data->dev_private);
+	char z_name[RTE_MEMZONE_NAMESIZE];
+	char z_name_sw[RTE_MEMZONE_NAMESIZE];
+
+	/* Add status entries */
+	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
+
+	snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
+		 eth_dev->driver->pci_drv.name, "tx_ring",
+		 eth_dev->data->port_id, queue_id);
+	snprintf(z_name_sw, sizeof(z_name_sw), "%s_sw_ring", z_name);
+
+	txq->q.desc = alloc_ring(txq->q.size, sizeof(struct tx_desc),
+				 sizeof(struct tx_sw_desc), &txq->q.phys_addr,
+				 &txq->q.sdesc, s->stat_len, queue_id,
+				 socket_id, z_name, z_name_sw);
+	if (!txq->q.desc)
+		return -ENOMEM;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
+			    F_FW_CMD_WRITE | F_FW_CMD_EXEC |
+			    V_FW_EQ_ETH_CMD_PFN(adap->pf) |
+			    V_FW_EQ_ETH_CMD_VFN(0));
+	c.alloc_to_len16 = htonl(F_FW_EQ_ETH_CMD_ALLOC |
+				 F_FW_EQ_ETH_CMD_EQSTART | (sizeof(c) / 16));
+	c.autoequiqe_to_viid = htonl(F_FW_EQ_ETH_CMD_AUTOEQUEQE |
+				     V_FW_EQ_ETH_CMD_VIID(pi->viid));
+	c.fetchszm_to_iqid =
+		htonl(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
+		      V_FW_EQ_ETH_CMD_PCIECHN(pi->tx_chan) |
+		      F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(iqid));
+	c.dcaen_to_eqsize =
+		htonl(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
+		      V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
+		      V_FW_EQ_ETH_CMD_EQSIZE(nentries));
+	c.eqaddr = rte_cpu_to_be_64(txq->q.phys_addr);
+
+	ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
+	if (ret) {
+		rte_free(txq->q.sdesc);
+		txq->q.sdesc = NULL;
+		txq->q.desc = NULL;
+		return ret;
+	}
+
+	init_txq(adap, &txq->q, G_FW_EQ_ETH_CMD_EQID(ntohl(c.eqid_pkd)));
+	txq->stats.tso = 0;
+	txq->stats.pkts = 0;
+	txq->stats.tx_cso = 0;
+	txq->stats.coal_wr = 0;
+	txq->stats.vlan_ins = 0;
+	txq->stats.tx_bytes = 0;
+	txq->stats.coal_pkts = 0;
+	txq->stats.mapping_err = 0;
+	txq->flags |= EQ_STOPPED;
+	txq->eth_dev = eth_dev;
+	return 0;
+}
+
+static void free_txq(struct sge_txq *q)
+{
+	q->cntxt_id = 0;
+	q->sdesc = NULL;
+	q->desc = NULL;
+}
+
+static void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq,
+			 struct sge_fl *fl)
+{
+	unsigned int fl_id = fl ? fl->cntxt_id : 0xffff;
+
+	t4_iq_free(adap, adap->mbox, adap->pf, 0, FW_IQ_TYPE_FL_INT_CAP,
+		   rq->cntxt_id, fl_id, 0xffff);
+	rq->cntxt_id = 0;
+	rq->abs_id = 0;
+	rq->desc = NULL;
+
+	if (fl) {
+		free_rx_bufs(fl, fl->avail);
+		rte_free(fl->sdesc);
+		fl->sdesc = NULL;
+		fl->cntxt_id = 0;
+		fl->desc = NULL;
+	}
+}
+
+/*
+ * Clear all queues of the port
+ *
+ * Note:  This function must only be called after rx and tx path
+ * of the port have been disabled.
+ */
+void t4_sge_eth_clear_queues(struct port_info *pi)
+{
+	int i;
+	struct adapter *adap = pi->adapter;
+	struct sge_eth_rxq *rxq = &adap->sge.ethrxq[pi->first_qset];
+	struct sge_eth_txq *txq = &adap->sge.ethtxq[pi->first_qset];
+
+	for (i = 0; i < pi->n_rx_qsets; i++, rxq++) {
+		if (rxq->rspq.desc)
+			t4_sge_eth_rxq_stop(adap, &rxq->rspq);
+	}
+	for (i = 0; i < pi->n_tx_qsets; i++, txq++) {
+		if (txq->q.desc) {
+			struct sge_txq *q = &txq->q;
+
+			t4_sge_eth_txq_stop(txq);
+			reclaim_completed_tx(q);
+			free_tx_desc(q, q->size);
+			q->equeidx = q->pidx;
+		}
+	}
+}
+
+void t4_sge_eth_rxq_release(struct adapter *adap, struct sge_eth_rxq *rxq)
+{
+	if (rxq->rspq.desc) {
+		t4_sge_eth_rxq_stop(adap, &rxq->rspq);
+		free_rspq_fl(adap, &rxq->rspq, rxq->fl.size ? &rxq->fl : NULL);
+	}
+}
+
+void t4_sge_eth_txq_release(struct adapter *adap, struct sge_eth_txq *txq)
+{
+	if (txq->q.desc) {
+		t4_sge_eth_txq_stop(txq);
+		reclaim_completed_tx(&txq->q);
+		t4_eth_eq_free(adap, adap->mbox, adap->pf, 0, txq->q.cntxt_id);
+		free_tx_desc(&txq->q, txq->q.size);
+		rte_free(txq->q.sdesc);
+		free_txq(&txq->q);
+	}
+}
+
+/**
+ * t4_free_sge_resources - free SGE resources
+ * @adap: the adapter
+ *
+ * Frees resources used by the SGE queue sets.
+ */
+void t4_free_sge_resources(struct adapter *adap)
+{
+	int i;
+	struct sge_eth_rxq *rxq = &adap->sge.ethrxq[0];
+	struct sge_eth_txq *txq = &adap->sge.ethtxq[0];
+
+	/* clean up Ethernet Tx/Rx queues */
+	for (i = 0; i < adap->sge.max_ethqsets; i++, rxq++, txq++) {
+		/* Free only the queues allocated */
+		if (rxq->rspq.desc) {
+			t4_sge_eth_rxq_release(adap, rxq);
+			rxq->rspq.eth_dev = NULL;
+		}
+		if (txq->q.desc) {
+			t4_sge_eth_txq_release(adap, txq);
+			txq->eth_dev = NULL;
+		}
+	}
+
+	if (adap->sge.fw_evtq.desc)
+		free_rspq_fl(adap, &adap->sge.fw_evtq, NULL);
+}
+
+/**
+ * t4_sge_init_soft - initialize SGE soft state
+ * @adap: the adapter
+ *
+ * Performs SGE initialization needed every time after a chip reset.
+ * We do not initialize any of the queues here; instead the driver
+ * top-level must request those individually.
+ *
+ * Called in two different modes:
+ *
+ *  1. Perform actual hardware initialization and record hard-coded
+ *     parameters which were used.  This gets used when we're the
+ *     Master PF and the Firmware Configuration File support didn't
+ *     work for some reason.
+ *
+ *  2. We're not the Master PF or initialization was performed with
+ *     a Firmware Configuration File.  In this case we need to grab
+ *     any of the SGE operating parameters that we need to have in
+ *     order to do our job and make sure we can live with them ...
+ */
+static int t4_sge_init_soft(struct adapter *adap)
+{
+	struct sge *s = &adap->sge;
+	u32 fl_small_pg, fl_large_pg, fl_small_mtu, fl_large_mtu;
+	u32 timer_value_0_and_1, timer_value_2_and_3, timer_value_4_and_5;
+	u32 ingress_rx_threshold;
+
+	/*
+	 * Verify that CPL messages are going to the Ingress Queue for
+	 * process_responses() and that only packet data is going to the
+	 * Free Lists.
+	 */
+	if ((t4_read_reg(adap, A_SGE_CONTROL) & F_RXPKTCPLMODE) !=
+	    V_RXPKTCPLMODE(X_RXPKTCPLMODE_SPLIT)) {
+		dev_err(adap, "bad SGE CPL MODE\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Validate the Host Buffer Register Array indices that we want to
+	 * use ...
+	 *
+	 * XXX Note that we should really read through the Host Buffer Size
+	 * XXX register array and find the indices of the Buffer Sizes which
+	 * XXX meet our needs!
+	 */
+#define READ_FL_BUF(x) \
+	t4_read_reg(adap, A_SGE_FL_BUFFER_SIZE0 + (x) * sizeof(u32))
+
+	fl_small_pg = READ_FL_BUF(RX_SMALL_PG_BUF);
+	fl_large_pg = READ_FL_BUF(RX_LARGE_PG_BUF);
+	fl_small_mtu = READ_FL_BUF(RX_SMALL_MTU_BUF);
+	fl_large_mtu = READ_FL_BUF(RX_LARGE_MTU_BUF);
+
+	/*
+	 * We only bother using the Large Page logic if the Large Page Buffer
+	 * is larger than our Page Size Buffer.
+	 */
+	if (fl_large_pg <= fl_small_pg)
+		fl_large_pg = 0;
+
+#undef READ_FL_BUF
+
+	/*
+	 * The Page Size Buffer must be exactly equal to our Page Size and the
+	 * Large Page Size Buffer should be 0 (per above) or a power of 2.
+	 */
+	if (fl_small_pg != PAGE_SIZE ||
+	    (fl_large_pg & (fl_large_pg - 1)) != 0) {
+		dev_err(adap, "bad SGE FL page buffer sizes [%d, %d]\n",
+			fl_small_pg, fl_large_pg);
+		return -EINVAL;
+	}
+	if (fl_large_pg)
+		s->fl_pg_order = ilog2(fl_large_pg) - PAGE_SHIFT;
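+	/*
+	 * E.g. a 64KB Large Page buffer with PAGE_SHIFT = 12 gives
+	 * fl_pg_order = ilog2(65536) - 12 = 4.
+	 */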
+
+	if (adap->use_unpacked_mode) {
+		int err = 0;
+
+		if (fl_small_mtu < FL_MTU_SMALL_BUFSIZE(adap)) {
+			dev_err(adap, "bad SGE FL small MTU %d\n",
+				fl_small_mtu);
+			err = -EINVAL;
+		}
+		if (fl_large_mtu < FL_MTU_LARGE_BUFSIZE(adap)) {
+			dev_err(adap, "bad SGE FL large MTU %d\n",
+				fl_large_mtu);
+			err = -EINVAL;
+		}
+		if (err)
+			return err;
+	}
+
+	/*
+	 * Retrieve our RX interrupt holdoff timer values and counter
+	 * threshold values from the SGE parameters.
+	 */
+	timer_value_0_and_1 = t4_read_reg(adap, A_SGE_TIMER_VALUE_0_AND_1);
+	timer_value_2_and_3 = t4_read_reg(adap, A_SGE_TIMER_VALUE_2_AND_3);
+	timer_value_4_and_5 = t4_read_reg(adap, A_SGE_TIMER_VALUE_4_AND_5);
+	s->timer_val[0] = core_ticks_to_us(adap,
+					   G_TIMERVALUE0(timer_value_0_and_1));
+	s->timer_val[1] = core_ticks_to_us(adap,
+					   G_TIMERVALUE1(timer_value_0_and_1));
+	s->timer_val[2] = core_ticks_to_us(adap,
+					   G_TIMERVALUE2(timer_value_2_and_3));
+	s->timer_val[3] = core_ticks_to_us(adap,
+					   G_TIMERVALUE3(timer_value_2_and_3));
+	s->timer_val[4] = core_ticks_to_us(adap,
+					   G_TIMERVALUE4(timer_value_4_and_5));
+	s->timer_val[5] = core_ticks_to_us(adap,
+					   G_TIMERVALUE5(timer_value_4_and_5));
+
+	ingress_rx_threshold = t4_read_reg(adap, A_SGE_INGRESS_RX_THRESHOLD);
+	s->counter_val[0] = G_THRESHOLD_0(ingress_rx_threshold);
+	s->counter_val[1] = G_THRESHOLD_1(ingress_rx_threshold);
+	s->counter_val[2] = G_THRESHOLD_2(ingress_rx_threshold);
+	s->counter_val[3] = G_THRESHOLD_3(ingress_rx_threshold);
+
+	return 0;
+}
+
+int t4_sge_init(struct adapter *adap)
+{
+	struct sge *s = &adap->sge;
+	u32 sge_control, sge_control2, sge_conm_ctrl;
+	unsigned int ingpadboundary, ingpackboundary;
+	int ret, egress_threshold;
+
+	/*
+	 * Ingress Padding Boundary and Egress Status Page Size are set up by
+	 * t4_fixup_host_params().
+	 */
+	sge_control = t4_read_reg(adap, A_SGE_CONTROL);
+	s->pktshift = G_PKTSHIFT(sge_control);
+	s->stat_len = (sge_control & F_EGRSTATUSPAGESIZE) ? 128 : 64;
+
+	/*
+	 * T4 uses a single control field to specify both the PCIe Padding and
+	 * Packing Boundary.  T5 introduced the ability to specify these
+	 * separately.  The actual Ingress Packet Data alignment boundary
+	 * within Packed Buffer Mode is the maximum of these two
+	 * specifications.
+	 */
+	ingpadboundary = 1 << (G_INGPADBOUNDARY(sge_control) +
+			 X_INGPADBOUNDARY_SHIFT);
+	s->fl_align = ingpadboundary;
+
+	if (!is_t4(adap->params.chip) && !adap->use_unpacked_mode) {
+		/*
+		 * T5 has a weird interpretation of one of the PCIe Packing
+		 * Boundary values.  No idea why ...
+		 */
+		sge_control2 = t4_read_reg(adap, A_SGE_CONTROL2);
+		ingpackboundary = G_INGPACKBOUNDARY(sge_control2);
+		if (ingpackboundary == X_INGPACKBOUNDARY_16B)
+			ingpackboundary = 16;
+		else
+			ingpackboundary = 1 << (ingpackboundary +
+					  X_INGPACKBOUNDARY_SHIFT);
+
+		s->fl_align = max(ingpadboundary, ingpackboundary);
+	}
+
+	ret = t4_sge_init_soft(adap);
+	if (ret < 0) {
+		dev_err(adap, "%s: t4_sge_init_soft failed, error %d\n",
+			__func__, -ret);
+		return ret;
+	}
+
+	/*
+	 * A FL with <= fl_starve_thres buffers is starving and a periodic
+	 * timer will attempt to refill it.  This needs to be larger than the
+	 * SGE's Egress Congestion Threshold.  If it isn't, then we can get
+	 * stuck waiting for new packets while the SGE is waiting for us to
+	 * give it more Free List entries.  (Note that the SGE's Egress
+	 * Congestion Threshold is in units of 2 Free List pointers.)  For T4,
+	 * there was only a single field to control this.  For T5 there's the
+	 * original field which now only applies to Unpacked Mode Free List
+	 * buffers and a new field which only applies to Packed Mode Free List
+	 * buffers.
+	 */
+	sge_conm_ctrl = t4_read_reg(adap, A_SGE_CONM_CTRL);
+	if (is_t4(adap->params.chip) || adap->use_unpacked_mode)
+		egress_threshold = G_EGRTHRESHOLD(sge_conm_ctrl);
+	else
+		egress_threshold = G_EGRTHRESHOLDPACKING(sge_conm_ctrl);
+	s->fl_starve_thres = 2 * egress_threshold + 1;
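+	/*
+	 * Example: an Egress Congestion Threshold of 320 (i.e. 640 Free
+	 * List pointers) gives fl_starve_thres = 2 * 320 + 1 = 641.
+	 */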
+
+	return 0;
+}