new file mode 100644
@@ -0,0 +1,962 @@
+=====================================
+Flow API support in TAP PMD, using TC
+=====================================
+
+.. contents::
+.. sectnum::
+
+.. footer::
+
+ v0.8 - page ###Page###
+
+.. raw:: pdf
+
+ PageBreak
+
+Rationale
+=========
+
+For this project, the tap PMD has to receive selected traffic from a different
+netdevice (refer to *VM migration with Microsoft Hyper-V and Mellanox
+ConnectX-3* document) and only cover the same set of rules as supported by the
+mlx4 PMD.
+
+The DPDK traffic classifier is the rte_flow API, and the tap PMD must therefore
+implement it. For that, TC was chosen for several reasons:
+
+- it happens very early in the kernel stack for ingress (faster than netfilter).
+- it supports dropping packets given a specific flow.
+- it supports redirecting packets to a different netdevice.
+- it has a "flower" classifier type that covers most of the pattern items in
+  rte_flow.
+- it can be configured through a netlink socket, without an external tool.
+
+Modes of operation
+==================
+
+There should be two modes of operation for the tap PMD regarding rte_flow:
+*local* and *remote*. Only one mode can be in use at a time for a specific tap
+interface.
+
+The *local* mode would be the default one, if no specific parameter is specified
+in the command line. To start the application with tap in *remote* mode, set the
+``remote`` tap parameter to the interface you want to redirect packets from,
+e.g.::
+
+ testpmd -n 4 -c 0xf -m 1024 --vdev=net_tap,iface=tap0,remote=eth3 -- \
+ -i --burst=64 --coremask=0x2
+
+*Local* mode
+------------
+
+In *local* mode, flow rules would be applied as-is, on the tap netdevice itself
+(e.g.: ``tap0``).
+
+The typical use-case is having a linux program (e.g. a webserver) communicating
+with the DPDK app through the tap netdevice::
+
+ +-------------------------+
+ | DPDK application |
+ +-------------------------+
+ | ^
+ | rte_flow rte_flow |
+ v egress ingress |
+ +-------------------------+
+ | Tap PMD |
+ +-------------------------+
+ | ^
+ | TC TC |
+ v ingress egress |
+ +-------------------------+ +-------------------------+
+ | |<-------------| |
+ | Tap netdevice (tap0) | | Linux app (webserver) |
+ | |------------->| |
+ +-------------------------+ +-------------------------+
+
+.. raw:: pdf
+
+ PageBreak
+
+*Remote* mode
+-------------
+
+In *remote* mode, flow rules would be applied on the tap netdevice (e.g.:
+``tap0``), and use a similar match to redirect specific packets from another
+netdevice (e.g.: ``eth3``, a NetVSC netdevice in our project scenario)::
+
+ +-------------------------+
+ | DPDK application |
+ +-------------------------+
+ | ^
+ | rte_flow rte_flow |
+ v egress ingress |
+ +-------------------------+
+ | Tap PMD |
+ +-------------------------+
+ | ^
+ | TC TC |
+ v ingress egress |
+ +-------------------------+ +-------------------------+
+ | |<------------------redirection-------\ |
+ | Tap netdevice (tap0) | | | |
+ | |------------->|-\ eth3 | |
+ +-------------------------+ +--|--------------------|-+
+ | TC TC ^
+ | egress ingress |
+ v |
+
+.. raw:: pdf
+
+ PageBreak
+
+rte_flow rules conversion
+=========================
+
+Netlink
+-------
+
+The only way to create TC rules in the kernel is through netlink messages.
+Two possibilities arise for managing TC rules:
+
+- Using native netlink API calls in the tap PMD
+- Calling the ``tc`` command from iproute2 inside our PMD, via ``system()``.
+
+The former will be done, as library calls are faster than changing context and
+executing an external program from within the tap PMD. Moreover, the kernel TC
+API might propose features not yet implemented in iproute2. Furthermore, a
+custom implementation enables finer tuning and better control.
+
+..
+ Some implementations for TC configuration through Netlink exist already. It's a
+ good source of inspiration on how to do it:
+
+ - iproute2's tc `source code`__
+ - ovs's tc implementation__ (not yet upstream)
+
+ __ https://github.com/shemminger/iproute2/tree/master/tc
+ __ https://mail.openvswitch.org/pipermail/ovs-dev/2016-November/324693.html
+
+Conversion examples
+-------------------
+
+Here are a few examples of rules and how they can be translated from rte_flow
+rules to TC rules. rte_flow rules will be expressed using testpmd's ``flow``
+command syntax, while TC rules will use iproute2 ``tc`` command syntax.
+
+**Notes**:
+ - rte_flow ``ingress`` direction can be translated into a TC ``egress`` rule,
+ and vice versa, when it applies to a tap interface, as TC considers the
+ kernel netdevice standpoint.
+ - in TC, redirecting a packet works by taking a packet from ``ingress`` and
+ sending to another device's ``egress``.
+
+*Local* mode
+~~~~~~~~~~~~
+
+#. Flow rule to give packets coming on the ``tap0`` interface to RX queue 0:
+
+ Using rte_flow::
+
+ flow validate 0 ingress pattern port index is 0 / end \
+ actions queue index 0 / end
+
+ Using ``tc``::
+
+ tc filter add dev tap0 parent 1: flower indev tap0 \
+ action skbedit queue_mapping 0
+
+#. Flow rule to get packets with source mac ``de:ad:ca:fe:00:02`` on RX queue 2:
+
+ Using rte_flow::
+
+ flow create 0 ingress pattern eth src is de:ad:ca:fe:00:02 / end \
+ actions queue 2 / end
+
+ Using ``tc``::
+
+ tc filter add dev tap0 parent 1: flower src_mac de:ad:ca:fe:00:02 \
+ action skbedit queue_mapping 2
+
+#. Flow rule to drop packets matching specific 5-tuple info:
+
+ Using rte_flow::
+
+ flow create 0 ingress pattern eth dst is 3a:80:ce:61:36:54 \
+ src is 52:43:7b:fd:ac:f3 / ipv4 src is 1.1.1.1 dst is 2.2.2.2 \
+ / udp src is 4444 dst is 5555 / end actions drop / end
+
+ Using ``tc``::
+
+ tc filter add dev tap0 parent 1: flower dst_mac 3a:80:ce:61:36:54 \
+ src_mac 52:43:7b:fd:ac:f3 eth_type ip src_ip 1.1.1.1 dst_ip 2.2.2.2 \
+ ip_proto udp src_port 4444 dst_port 5555 action drop
+
+*Remote* mode
+~~~~~~~~~~~~~
+
+In *remote* mode, an additional rule for redirecting packets is systematically
+required. The examples are similar to the previous section (the rte_flow rule
+will thus be omitted).
+
+#. TC rules to give packets coming on the ``eth3`` interface to ``tap0`` RX
+ queue 0::
+
+ # redirection rule
+ tc filter add dev eth3 parent ffff: flower indev eth3 \
+ action mirred egress redirect dev tap0
+ # actual tap rule
+ tc filter add dev tap0 parent 1: flower indev tap0 \
+ action skbedit queue_mapping 0
+
+#. TC rules to get packets with source mac ``de:ad:ca:fe:00:02`` on RX queue 2::
+
+ # redirection rule
+ tc filter add dev eth3 parent ffff: flower src_mac de:ad:ca:fe:00:02 \
+ action mirred egress redirect dev tap0
+ # actual tap rule
+ tc filter add dev tap0 parent 1: flower src_mac de:ad:ca:fe:00:02 \
+ action skbedit queue_mapping 2
+
+#. TC rules to drop packets matching specific 5-tuple info::
+
+ # redirection rule
+ tc filter add dev eth3 parent ffff: flower dst_mac 3a:80:ce:61:36:54 \
+ src_mac 52:43:7b:fd:ac:f3 eth_type ip src_ip 1.1.1.1 dst_ip 2.2.2.2 \
+ ip_proto udp src_port 4444 dst_port 5555 \
+ action mirred egress redirect dev tap0
+ # actual tap rule
+ tc filter add dev tap0 parent 1: flower dst_mac 3a:80:ce:61:36:54 \
+ src_mac 52:43:7b:fd:ac:f3 eth_type ip src_ip 1.1.1.1 dst_ip 2.2.2.2 \
+ ip_proto udp src_port 4444 dst_port 5555 action drop
+
+One last thing, to redirect packets the other way around (from ``tap0`` to
+``eth3``), we would use a similar rule, exchanging interfaces and using an
+appropriate match, e.g.::
+
+ tc filter add dev tap0 parent ffff: flower indev tap0 \
+ action mirred egress redirect dev eth3
+
+..
+ **Note:** ``parent ffff:`` is for TC ``ingress`` while ``parent 1:`` is for TC
+ ``egress``.
+
+Broadcast and promiscuous support
++++++++++++++++++++++++++++++++++
+
+*Remote* mode requirements:
+
+#. When turning the tap netdevice promiscuous, the remote netdevice should
+ implicitly be turned promiscuous too, to get as many packets as possible.
+
+#. Packets matching the destination MAC configured in the tap PMD should be
+ redirected from the remote without being processed by the stack there in the
+ kernel.
+
+#. In promiscuous mode, an incoming packet should be duplicated to be processed
+ both by the tap PMD and the remote netdevice itself.
+
+#. Incoming packets with broadcast destination MAC (i.e.: ``ff:ff:ff:ff:ff:ff``)
+ should be duplicated to be processed both by the tap PMD and the remote
+ netdevice itself.
+
+#. Incoming packets with IPv6 multicast destination MAC (i.e.:
+ ``33:33:00:00:00:00/33:33:00:00:00:00``) should be duplicated to be processed
+ both by the tap PMD and the remote netdevice itself.
+
+#. Incoming packets with broadcast/multicast bit set in the destination MAC
+ (i.e.: ``01:00:00:00:00:00/01:00:00:00:00:00``) should be duplicated to be
+ processed both by the tap PMD and the remote netdevice itself.
+
+Each of these requirements (except the first one) can be directly translated
+into a TC rule, e.g.::
+
+ # local mac (notice the REDIRECT for mirred action):
+ tc filter add dev eth3 parent ffff: prio 1 flower dst_mac de:ad:be:ef:01:02 \
+ action mirred egress redirect dev tap0
+
+ # tap promisc:
+ tc filter add dev eth3 parent ffff: prio 2 basic \
+ action mirred egress mirror dev tap0
+
+ # broadcast:
+ tc filter add dev eth3 parent ffff: prio 3 flower dst_mac ff:ff:ff:ff:ff:ff \
+ action mirred egress mirror dev tap0
+
+ # broadcast v6 (can't express mac_mask with tc, but it works via netlink):
+ tc filter add dev eth3 parent ffff: prio 4 flower dst_mac 33:33:00:00:00:00 \
+ action mirred egress mirror dev tap0
+
+ # all_multi (can't express mac_mask with tc, but it works via netlink):
+ tc filter add dev eth3 parent ffff: prio 5 flower dst_mac 01:00:00:00:00:00 \
+ action mirred egress mirror dev tap0
+
+When promiscuous mode is switched off or on, the first TC rule will be modified
+to have respectively an empty action (``continue``) or the ``mirror`` action.
+
+The first 5 priorities are always reserved, and can only be used for these
+filters.
+
+On top of that, the tap PMD can configure explicit rte_flow rules, translated as
+TC rules on both the remote netdevice and the tap netdevice. On the remote,
+those would need to be processed after the default rules handling promiscuous
+mode, broadcast and all_multi packets.
+
+When using the ``mirror`` action, the packet is duplicated and sent to the tap
+netdevice, while the original packet gets directly processed by the kernel
+without going through later TC rules for the remote. On the tap netdevice, the
+duplicated packet will go through tap TC rules and be classified depending on
+those rules.
+
+**Note:** It is possible to combine a ``mirror`` action and a ``continue``
+action for a single TC rule. Then the original packet would undergo remaining TC
+rules on the remote netdevice side.
+
+When using the ``redirect`` action, the behavior is similar on the tap side, but
+the packet is not duplicated, no further kernel processing is done for the
+remote side.
+
+The following diagram sums it up. A packet that matches a TC rule follows the
+associated action (the number in the diamond represents the rule prio as set in
+the above TC rules)::
+
+
+ Incoming packet |
+ on remote (eth3) |
+ | Going through
+ | TC ingress rules
+ v
+ / \
+                              / 1 \
+                             /     \ yes
+                            /  mac  \____________________>  tap0
+                            \ match?/   redirected pkt
+ \ /
+ \ /
+ \ /
+ V no, then continue
+ | with TC rules
+ |
+ v
+ / \
+ / 2 \
+ eth3 yes / \ yes
+ kernel <____________________ /promisc\____________________> tap0
+ stack original pkt \ match?/ duplicated pkt
+ \ /
+ \ /
+ \ /
+ V no, then continue
+ | with TC rules
+ |
+ v
+ / \
+ / 3 \
+ eth3 yes / \ yes
+ kernel <____________________ / bcast \____________________> tap0
+ stack original pkt \ match?/ duplicated pkt
+ \ /
+ \ /
+ \ /
+ V no, then continue
+ | with TC rules
+ |
+ v
+ / \
+ / 4 \
+ eth3 yes / \ yes
+ kernel <____________________ / bcast6\____________________> tap0
+ stack original pkt \ match?/ duplicated pkt
+ \ /
+ \ /
+ \ /
+ V no, then continue
+ | with TC rules
+ |
+ v
+ / \
+ / 5 \
+ eth3 yes / all \ yes
+ kernel <____________________ / multi \____________________> tap0
+ stack original pkt \ match?/ duplicated pkt
+ \ /
+ \ /
+ \ /
+ V no, then continue
+ | with TC rules
+ |
+ v
+ |
+ . remaining TC rules
+ .
+ eth3 |
+ kernel <________________________/
+ stack original pkt
+
+.. raw:: pdf
+
+ PageBreak
+
+Associating an rte_flow rule with a TC one
+==========================================
+
+A TC rule is identified by a ``priority`` (16-bit value) and a ``handle``
+(32-bit value). To delete a rule, the priority must be specified, and if several
+rules have the same priority, the handle is needed to select the correct one.
+
+..
+ Specifying an empty priority and handle when requesting a TC rule creation will
+ let the kernel automatically decide what values to set. In fact, the kernel will
+ start with a high priority (i.e. 49152) and subsequent rules will get decreasing
+  priorities (lower priorities get evaluated first).
+
+To avoid further requests to the kernel to identify what priority/handle has
+been automatically allocated, the tap PMD can set priorities and handles
+systematically when creating a rule.
+
+In *local* mode, an rte_flow rule should be translated into a single TC flow
+identified by priority+handle.
+
+In *remote* mode, an rte_flow rule requires two TC rules, one on the tap
+netdevice itself (for the correct action) and another one on the other netdevice
+where packets are redirected from. Both TC rules' priorities+handles must be
+stored for a specific rte_flow rule, and associated with the device they are
+applied on.
+
+.. raw:: pdf
+
+ PageBreak
+
+Considerations regarding Flow API support
+=========================================
+
+Flow rule attributes
+--------------------
+
+Groups and priorities:
+ There is no native support of groups in TC. Instead, the priority field
+ (which is part of the netlink TC msg header) can be adapted. The four MSB
+ would be used to define the group (allowing for 16 groups), while the 12 LSB
+ would be left to define the actual priority (up to 4096).
+
+ Rules with lower priorities are evaluated first. For rules with identical
+ priorities, the one with the highest handle value gets evaluated first.
+
+Direction:
+ Both ingress and egress filtering can be supported.
+
+Meta item types
+---------------
+
+Most applications will use: ``(END | VOID)``
+
+END, VOID:
+ Supported without problem.
+
+INVERT:
+  There is no easy way to support that in TC. It won't be supported.
+
+ **mlx4 will not support it either.**
+
+PF, VF, PORT:
+ Not applicable to a tap netdevice.
+
+Data matching item types
+------------------------
+
+Most applications will use:
+``ETH / (IPV4 | IPV6 | END) / (TCP | UDP | END) / END``
+
+ANY:
+ Should be supported.
+
+ **mlx4 will partially support it.**
+
+RAW:
+ It is not planned to support it for now. Matching Raw packets would require
+ using a different classifier than "flower", which is the most simple and
+ applicable for otherwise most other cases. With TC, it's not possible to
+ support in the same rule both "flower" and raw packets.
+
+ **mlx4 will not support it either**.
+
+VLAN:
+ Matching VLAN ID and prio supported.
+ **Note: linux v4.9 required for VLAN support.**
+
+ETH, IPV4, IPV6, UDP, TCP:
+ Matching source/destination MAC/IP/port is supported, with masks.
+
+ **mlx4 does not support partial bit-masks (full or zeroed only).**
+
+ICMP:
+ By specifying the appropriate ether type, ICMP packets can be matched.
+ However, there is no support for ICMP type or code.
+
+ **mlx4 will not support it, however.**
+
+SCTP:
+ By specifying the appropriate IP protocol, SCTP packets can be matched.
+ However, no specific SCTP fields can be matched.
+
+ **mlx4 will not support it, however.**
+
+VXLAN:
+ VXLAN is not recognized by the "flower" classifier. Kernel-managed VXLAN
+ traffic would come through an additional netdevice, which falls outside
+ the scope of this project. VXLAN traffic should occur outside VMs anyway.
+
+Action types
+------------
+
+Most applications will use: ``(VOID | END | QUEUE | DROP) / END``
+
+By default, multiple actions are possible for TC flow rules. However, they are
+ordered in the kernel. The implementation will need to handle actions in a way
+that orders them intelligently when creating them.
+
+VOID, END:
+ Supported.
+
+PASSTHRU:
+ The generic "continue" action can be used.
+
+ **mlx4 will not support it, however**.
+
+MARK / FLAG:
+ The mark is a field inside an skbuff. However, the tap reads messages (mostly
+ packet data), without that info. As an alternative, it may be possible to
+ create a specific queue to pass packets with a specific mark. Further testing
+  is needed to ensure it is feasible.
+
+QUEUE:
+ The ``skbedit`` action with the ``queue_mapping`` option enables directing
+ packets to specific queues.
+
+ Like rte_flow, specifying several ``skbedit queue_mapping`` actions in TC
+ only considers the last one.
+
+DROP:
+ The generic "drop" action can be used. Packets will effectively be dropped,
+ and not left for the kernel to process.
+
+COUNT: Stats are automatically stored in the kernel. The COUNT action will thus
+ be ignored when creating the rule. ``rte_flow_query()`` can be implemented
+ to request a rule's stats from the kernel.
+
+DUP:
+ Duplicating packets is not supported.
+
+RSS:
+ There's no built-in mechanism for RSS in TC.
+
+ By default, incoming packets go to the tap PMD queue 0. To support RSS in
+ software, several additional queues must be set up. Packets coming in on
+ queue 0 can be considered as requiring RSS, and the PMD will apply software
+ rss (using something like ``rte_softrss()``) to select a queue for the
+ packet.
+
+PF, VF:
+ Not applicable to a tap netdevice.
+
+.. raw:: pdf
+
+ PageBreak
+
+TC limitations for flow collision
+=================================
+
+From a TC standpoint, filter rules with identical priorities do not collide as
+long as they specify the same set of fields in the TC message, with identical
+field masks, and differ in at least one field value.
+
+Unfortunately, some flows that obviously are not colliding can be considered
+otherwise by the kernel when parsing the TC messages, and thus their creation
+would be rejected.
+
+Here is a table for matching TC fields with their flow API equivalent:
+
++------------------------------+-----------------------------------+-----------+
+| TC message field | rte_flow API | maskable? |
++==============================+===================================+===========+
+| TCA_FLOWER_KEY_ETH_DST | eth dst | yes |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_ETH_SRC | eth src | yes |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_ETH_TYPE | eth type is 0xZZZZ || | no |
+| | eth / {ipv4|ipv6} | |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_IP_PROTO | eth / {ipv4|ipv6} / {tcp|udp} | no |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_IPV4_SRC | eth / ipv4 src | yes |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_IPV4_DST | eth / ipv4 dst | yes |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_IPV6_SRC | eth / ipv6 src | yes |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_IPV6_DST | eth / ipv6 dst | yes |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_L4_SRC        | eth / {ipv4|ipv6} / {tcp|udp} src | no        |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_L4_DST        | eth / {ipv4|ipv6} / {tcp|udp} dst | no        |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_VLAN_ID | eth / vlan vid | no |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_VLAN_PRIO | eth / vlan pcp | no |
++------------------------------+-----------------------------------+-----------+
+| TCA_FLOWER_KEY_VLAN_ETH_TYPE | eth / vlan tpid | no |
++------------------------------+-----------------------------------+-----------+
+
+When creating rules with identical priorities, one must make sure that they
+would be translated in TC using the same fields as shown in the above table.
+
+The following flow rules can share the same priority, as they use the same
+fields with identical masks under the hood::
+
+ > flow create 0 ingress priority 0 pattern eth / ipv4 / end
+ actions drop / end
+ Flow rule #0 created
+ > flow create 0 ingress priority 0 pattern eth type is 0x86dd / end
+ actions drop / end
+ Flow rule #1 created
+
+**Note:** Both rules use ETH_TYPE (mask 0xffff) in their TC form.
+
+Sadly, the following flow rules cannot share the same priority, since fields for
+matching IPv4 and IPv6 src/dst addresses are different::
+
+ > flow create 0 ingress priority 1 pattern eth / ipv4 src is 1.1.1.1 / end
+ actions drop / end
+ Flow rule #0 created
+ > flow create 0 ingress priority 1 pattern eth / ipv6 src is ::1 / end
+ actions drop / end
+ PMD: Kernel refused TC filter rule creation (22): Invalid argument
+ Caught error type 2 (flow rule (handle)): overlapping rules
+
+**Note:** First rule uses ETH_TYPE and IPV4_SRC, while the second uses ETH_TYPE
+and IPV6_SRC.
+
+It is however possible to match different IPvX addresses with the same
+priority::
+
+ > flow create 0 ingress priority 2 pattern eth / ipv4 src is 1.1.1.1 / end
+ actions drop / end
+ Flow rule #0 created
+ > flow create 0 ingress priority 2 pattern eth / ipv4 src is 2.2.2.2 / end
+ actions drop / end
+ Flow rule #1 created
+
+If the first rule specifies both destination and source addresses, then the
+other rule with the same priority must too (with identical masks)::
+
+ > flow create 0 ingress priority 3 pattern eth / ipv4 src is 1.1.1.1
+ dst is 1.1.1.2 / end actions drop / end
+ Flow rule #0 created
+ > flow create 0 ingress priority 3 pattern eth / ipv4 src is 2.2.2.2 / end
+ actions drop / end
+ PMD: Kernel refused TC filter rule creation (22): Invalid argument
+ Caught error type 2 (flow rule (handle)): overlapping rules
+ > flow create 0 ingress priority 3 pattern eth / ipv4 src is 2.2.2.2
+ dst spec 2.2.2.3 dst mask 255.255.255.0 / end actions drop / end
+ PMD: Kernel refused TC filter rule creation (22): Invalid argument
+ Caught error type 2 (flow rule (handle)): overlapping rules
+ > flow create 0 ingress priority 3 pattern eth / ipv4 src is 2.2.2.2
+ dst is 2.2.2.3 / end actions drop / end
+ Flow rule #1 created
+
+**Note:** First rule uses ETH_TYPE, IPV4_SRC and IPV4_DST (with full masks). The
+two others must also use those to share the same priority.
+
+It is possible to match TCP/UDP packets with different ports whatever the
+underlying L3, if the same fields are used (thus no l3 addresses specification).
+For instance::
+
+ > flow create 0 ingress priority 4 pattern eth / ipv4 / tcp dst is 3333 / end
+ actions drop / end
+ Flow rule #0 created
+ > flow create 0 ingress priority 4 pattern eth / ipv6 / udp dst is 4444 / end
+ actions drop / end
+ Flow rule #1 created
+ > flow create 0 ingress priority 4 pattern eth / ipv6 / udp src is 5555 / end
+ actions drop / end
+ PMD: Kernel refused TC filter rule creation (22): Invalid argument
+ Caught error type 2 (flow rule (handle)): overlapping rules
+
+**Note:** First 2 rules use ETH_TYPE, IP_PROTO and L4_DST with different values
+but identical masks, so they're OK. Last rule used L4_SRC instead of L4_DST.
+
+.. raw:: pdf
+
+ PageBreak
+
+RSS implementation for tap
+==========================
+
+There are several areas of research for a tap RSS implementation:
+
+#. userland implementation in tap PMD
+#. userland implementation in DPDK (generic)
+#. userland implementation using combination of TC rules and BPF filters/actions
+#. kernel-side implementation in tap driver
+#. kernel-side implementation as a BPF classifier/action
+#. kernel-side implementation as a separate TC action
+
++--------------+------------------------------+------------------------------+
+| | Pros | Cons |
++==============+==============================+==============================+
+| tap PMD | - no kernel upstreaming | - tap PMD is supposed to be |
+| | | simple, and would no longer|
+| | | be. |
+| | | |
+| | | - complex rework, with many |
+| | | rings for enqueuing packets|
+| | | to the right queue |
+| | | |
+| | | - slower |
+| | | |
+| | | - won't be accepted as it |
+| | | doesn't make sense to redo |
+| | | what the kernel did |
+| | | previously |
++--------------+------------------------------+------------------------------+
+| generic DPDK | - would be useful to others | - design must be compatible |
+| | | with most PMDs |
+| | | |
+| | | - probably the longest to |
+| | | develop |
+| | | |
+| | | - requires DPDK community |
+| | | approval |
+| | | |
+| | | - requires heavy changes in |
+| | | tap PMD itself anyway |
++--------------+------------------------------+------------------------------+
+| TC rules | - no kernel upstreaming | - BPF is complicated to learn|
+| combination | | |
+| | - fast | - Runtime BPF compilation / |
+| | | or bytecode change, would |
+| | - per-flow RSS | be tricky |
+| | | |
+| | - no change in tap PMD | - much rework in the tap PMD |
+| | datapath | to handle lots of new |
+| | | netlink messages / actions |
++--------------+------------------------------+------------------------------+
+| tap driver | - pretty fast as it | - might not be accepted by |
+| | intervenes early in packet | the kernel community as |
+| | RX | they may cling to their |
+| | | jhash2 hashing function for|
+| | | RX. |
+| | | |
+| | | - only a single RSS context |
++--------------+------------------------------+------------------------------+
+| BPF | - fast | - BPF is complicated to learn|
+| classifier - | | |
+| action | - per-flow RSS | - would require changing the |
+| | | kernel API to support |
+| | | editing queue_mapping in an|
+| | | skb |
+| | | |
+| | | - hashing would be performed |
+| | | for each queue of a |
+| | | specific RSS context |
+| | | |
+| | | - probably difficult to gain |
+| | | community acceptance |
++--------------+------------------------------+------------------------------+
+| TC action | - much more flexibility, with| - needs to be in sync with |
+| | per-flow RSS, multiple | iproute2's tc program |
+| | keys, multiple packet | |
+| | fields for the hash... | - kernel upstreaming is not |
+| | | necessarily easy |
+| | - it's a separate kernel | |
+| | module that can be | - rework in tap PMD to |
+| | maintained out-of-tree and | support new RSS action and |
+| | optionally upstreamed | configuration |
+| | anytime | |
+| | | |
+| | - most logical to be handled | |
+| | in kernel as RSS is | |
+| | supposed to be computed in | |
+| | the "NIC" exactly once. | |
+| | | |
+| | - fastest | |
+| | | |
+| | - no change in tap PMD | |
+| | datapath | |
++--------------+------------------------------+------------------------------+
+
+TC rules using BPF from tap PMD
+-------------------------------
+
+The third solution is the best of the userland-based ones.
+It does the job well, fast (datapath running in kernel), is logically happening
+in the kernel in runtime, supports flow-based RSS, has the best potential to
+be accepted by the community.
+
+Advantages with this solution:
+- hash can be recorded in the packet data and read in tap PMD
+- no kernel customization, everything in DPDK
+- packet gets in tap PMD on the correct queue directly
+
+Drawbacks:
+- complicates tap PMD a lot:
+ - 3 BPF programs
+ - new implicit rules
+ - new action and filter support
+ - packet stripping
+- numerous TC rules required (in proportion with the number of queues)
+- fast (kernel + JIT BPF), but several TC rules must be crossed
+
+BPF programs controlled from tap PMD will be used to match packets, compute a
+hash given the configured key, and send packets to tap using the desired queue.
+
+Design
+~~~~~~
+
+BPF has a limited set of functions for editing the skb in TC. They are listed
+in ``linux/net/core/filter.c:tc_cls_act_func_proto()``:
+
+- skb_store_bytes
+- skb_load_bytes
+- skb_pull_data
+- csum_diff
+- csum_update
+- l3_csum_replace
+- l4_csum_replace
+- clone_redirect
+- get_cgroup_classid
+- skb_vlan_push
+- skb_vlan_pop
+- skb_change_proto
+- skb_change_type
+- skb_change_tail
+- skb_get_tunnel_key
+- skb_set_tunnel_key
+- skb_get_tunnel_opt
+- skb_set_tunnel_opt
+- redirect
+- get_route_realm
+- get_hash_recalc
+- set_hash_invalid
+- perf_event_output
+- get_smp_processor_id
+- skb_under_cgroup
+
+In a BPF program, it is typically not possible to edit the queue_mapping field
+to direct the packet in the correct queue. That part would be done by chaining a
+``skbedit queue_mapping`` action.
+
+It is not possible either to directly prepend data to a packet (appending works,
+though).
+
+A packet would go through these rules (on the local side of the tap netdevice):
+
++-----+---------------------------+----------------------------------+----------+
+|PRIO | Match | Action 1 | Action 2 |
++=====+===========================+==================================+==========+
+| 1 | marked? | skbedit queue 'mark' --> DPDK | |
++-----+---------------------------+----------------------------------+----------+
+| 2 | marked? | skbedit queue 'mark' --> DPDK | |
++-----+---------------------------+----------------------------------+----------+
+| ... | | | |
++-----+---------------------------+----------------------------------+----------+
+| x | ANY | BPF: append NULL 32bits for hash | |
+| | | | |
++-----+---------------------------+----------------------------------+----------+
+|x + 1| ACTUAL FLOW RULE 1 MATCH | ... | |
+| | | | |
++-----+---------------------------+----------------------------------+----------+
+|x + 2| ACTUAL FLOW RULE 2 MATCH | ... | |
+| | | | |
++-----+---------------------------+----------------------------------+----------+
+| ... | | | |
++-----+---------------------------+----------------------------------+----------+
+| y | FLOW RULE RSS 1 MATCH | BPF compute hash into packet |reclassify|
+| | | tailroom && set queue in skb->cb | |
++-----+---------------------------+----------------------------------+----------+
+|y + 1| FLOW RULE RSS 2 MATCH | BPF compute hash into packet |reclassify|
+| | | tailroom && set queue in skb->cb | |
++-----+---------------------------+----------------------------------+----------+
+| ... | | | |
++-----+---------------------------+----------------------------------+----------+
+| z | ANY (default RSS) | BPF compute hash into packet |reclassify|
+| | | tailroom && set queue in skb->cb | |
++-----+---------------------------+----------------------------------+----------+
+| z | ANY (isolate mode) | DROP | |
++-----+---------------------------+----------------------------------+----------+
+
+
+
+TC kernel action
+----------------
+
+The latest solution (implementing a TC action) would probably be the most simple
+to implement. It is also very flexible, opening more possibilities for filtering
+and RSS combined.
+
+For this solution, the following parameters could be used to configure RSS in a
+TC netlink message:
+
+``queues`` (u16 \*):
+ list of queues to spread incoming traffic on. That's actually the reta.
+ **Note:** the queue in an ``skb`` is on 16-bits, hence the type here.
+
+``key`` (u8 \*):
+ key to use for the Toeplitz-hash in this flow.
+
+``hash_fields`` (bitfield):
+ similar to what's in DPDK, the bitfield should determine what fields in the
+ packet header to use for hashing. It is likely another means of configuring
+ which fields to pick would be used actually.
+
+``algo`` (unsigned):
+ an enum value from the kernel act_rss header can be used to determine which
+ algorithm (implemented in the kernel) to use. Possible algos could be
+ toeplitz, xor, symmetric hash...
+
+**Note:** The number of queues to use is automatically deduced from the
+``queues`` netlink attribute length. The ``key`` length can be similarly
+obtained.
+
+.. raw:: pdf
+
+ PageBreak
+
+Appendix: TC netlink message
+============================
+
+**Note:** For deterministic behavior, TC queueing disciplines (QDISC), filters
+and classes must be flushed before starting to apply TC rules. There is a little
+bit of boilerplate (with specific netlink messages) to ensure TC rules can be
+applied. Typically, the TC ``ingress`` QDISC must be created first.
+
+For information, netlink messages regarding TC will look like this::
+
+ 0 8 16 24 32
+ +----------+----------+----------+----------+ ---
+ 0 | Length | \
+ +---------------------+---------------------+ \
+ 4 | Type | Flags | |
+ +----------- ---------+---------------------+ >-- struct
+ 8 | Sequence number | | nlmsghdr
+ +-------------------------------------------+ /
+ 12 | Process Port ID (PID) | /
+ +==========+==========+==========+==========+ ---
+ 16 | Family | Rsvd1 | Reserved2 | \
+ +----------+----------+---------------------+ \
+ 20 | Interface index | |
+ +-------------------------------------------+ |
+ 24 | Handle | |
+ +-------------------------------------------+ >-- struct
+ 28 | Parent handle | | tcmsg
+ | MAJOR + MINOR | |
+ +-------------------------------------------+ |
+ 32 | TCM info | /
+ | priority + protocol | /
+ +===========================================+ ---
+ | |
+ | Payload |
+ | |
+ ........................................
+ | |
+ | |
+ +-------------------------------------------+
@@ -39,6 +39,9 @@ EXPORT_MAP := rte_pmd_tap_version.map
LIBABIVER := 1
+# TAP_MAX_QUEUES must be a power of 2 as it will be used for masking
+TAP_MAX_QUEUES = 16
+
CFLAGS += -O3
CFLAGS += -I$(SRCDIR)
CFLAGS += -I.
@@ -47,6 +50,8 @@ LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
LDLIBS += -lrte_bus_vdev
+CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
+
#
# all source are stored in SRCS-y
#
@@ -89,7 +94,6 @@ tap_autoconf.h: tap_autoconf.h.new
mv '$<' '$@'
$(SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP):.c=.o): tap_autoconf.h
-
clean_tap: FORCE
$Q rm -f -- tap_autoconf.h tap_autoconf.h.new
@@ -45,7 +45,7 @@
#include <rte_ether.h>
#ifdef IFF_MULTI_QUEUE
-#define RTE_PMD_TAP_MAX_QUEUES 16
+#define RTE_PMD_TAP_MAX_QUEUES TAP_MAX_QUEUES
#else
#define RTE_PMD_TAP_MAX_QUEUES 1
#endif
@@ -90,6 +90,11 @@ struct pmd_internals {
int ioctl_sock; /* socket for ioctl calls */
int nlsk_fd; /* Netlink socket fd */
int flow_isolate; /* 1 if flow isolation is enabled */
+ int flower_support; /* 1 if kernel supports, else 0 */
+ int flower_vlan_support; /* 1 if kernel supports, else 0 */
+ int rss_enabled; /* 1 if RSS is enabled, else 0 */
+ /* implicit rules set when RSS is enabled */
+ LIST_HEAD(tap_rss_flows, rte_flow) rss_flows;
LIST_HEAD(tap_flows, rte_flow) flows; /* rte_flow rules */
/* implicit rte_flow rules set when a remote device is active */
LIST_HEAD(tap_implicit_flows, rte_flow) implicit_flows;
@@ -43,6 +43,9 @@
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>
+#include <linux/bpf.h>
+#include <linux/tc_act/tc_bpf.h>
+
#ifndef HAVE_TC_FLOWER
/*
* For kernels < 4.2, this enum is not defined. Runtime checks will be made to
@@ -104,6 +107,23 @@ struct remote_rule {
int mirred;
};
+struct action_data {
+ char id[16];
+
+ union {
+ struct tc_gact gact;
+ struct tc_mirred mirred;
+ struct skbedit {
+ struct tc_skbedit skbedit;
+ uint16_t queue;
+ } skbedit;
+ struct bpf {
+ int bpf_fd;
+ char *annotation;
+ } bpf;
+ };
+};
+
static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
@@ -134,6 +154,8 @@ tap_flow_isolate(struct rte_eth_dev *dev,
int set,
struct rte_flow_error *error);
+static int rss_enable(struct pmd_internals *pmd);
+
static const struct rte_flow_ops tap_flow_ops = {
.validate = tap_flow_validate,
.create = tap_flow_create,
@@ -816,111 +838,64 @@ tap_flow_item_validate(const struct rte_flow_item *item,
}
/**
- * Transform a DROP/PASSTHRU action item in the provided flow for TC.
- *
- * @param[in, out] flow
- * Flow to be filled.
- * @param[in] action
- * Appropriate action to be set in the TCA_GACT_PARMS structure.
- *
- * @return
- * 0 if checks are alright, -1 otherwise.
+ * Add one TC action (gact, mirred, skbedit or bpf) as a nested act_index
*/
static int
-add_action_gact(struct rte_flow *flow, int action)
+add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata)
{
struct nlmsg *msg = &flow->msg;
- size_t act_index = 1;
- struct tc_gact p = {
- .action = action
- };
- if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
- return -1;
- if (nlattr_nested_start(msg, act_index++) < 0)
+ if (nlattr_nested_start(msg, ++(*act_index)) < 0)
return -1;
- nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
- if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
- return -1;
- nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
- nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
- nlattr_nested_finish(msg); /* nested act_index */
- nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
- return 0;
-}
-
-/**
- * Transform a MIRRED action item in the provided flow for TC.
- *
- * @param[in, out] flow
- * Flow to be filled.
- * @param[in] ifindex
- * Netdevice ifindex, where to mirror/redirect packet to.
- * @param[in] action_type
- * Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring.
- *
- * @return
- * 0 if checks are alright, -1 otherwise.
- */
-static int
-add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
-{
- struct nlmsg *msg = &flow->msg;
- size_t act_index = 1;
- struct tc_mirred p = {
- .eaction = action_type,
- .ifindex = ifindex,
- };
- if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
- return -1;
- if (nlattr_nested_start(msg, act_index++) < 0)
- return -1;
- nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
+ nlattr_add(&msg->nh, TCA_ACT_KIND, strlen(adata->id), adata->id);
if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
return -1;
- if (action_type == TCA_EGRESS_MIRROR)
- p.action = TC_ACT_PIPE;
- else /* REDIRECT */
- p.action = TC_ACT_STOLEN;
- nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p);
+ if (strcmp("gact", adata->id) == 0) {
+ nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata->gact),
+ &adata->gact);
+ } else if (strcmp("mirred", adata->id) == 0) {
+ if (adata->mirred.eaction == TCA_EGRESS_MIRROR)
+ adata->mirred.action = TC_ACT_PIPE;
+ else /* REDIRECT */
+ adata->mirred.action = TC_ACT_STOLEN;
+ nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(adata->mirred),
+ &adata->mirred);
+ } else if (strcmp("skbedit", adata->id) == 0) {
+ nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS,
+ sizeof(adata->skbedit.skbedit),
+ &adata->skbedit.skbedit);
+ nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING,
+ adata->skbedit.queue);
+ } else if (strcmp("bpf", adata->id) == 0) {
+ nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata->bpf.bpf_fd);
+ nlattr_add(&msg->nh, TCA_ACT_BPF_NAME,
+ strlen(adata->bpf.annotation),
+ adata->bpf.annotation);
+ } else {
+ return -1;
+ }
nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
nlattr_nested_finish(msg); /* nested act_index */
- nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
return 0;
}
/**
- * Transform a QUEUE action item in the provided flow for TC.
- *
- * @param[in, out] flow
- * Flow to be filled.
- * @param[in] queue
- * Queue id to use.
- *
- * @return
- * 0 if checks are alright, -1 otherwise.
+ * Add a list of TC actions nested under the given classifier action attribute
*/
static int
-add_action_skbedit(struct rte_flow *flow, uint16_t queue)
+add_actions(struct rte_flow *flow, int nb_actions, struct action_data *data,
+ int classifier_action)
{
struct nlmsg *msg = &flow->msg;
- size_t act_index = 1;
- struct tc_skbedit p = {
- .action = TC_ACT_PIPE
- };
+ size_t act_index = 0;
+ int i;
- if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
- return -1;
- if (nlattr_nested_start(msg, act_index++) < 0)
+ if (nlattr_nested_start(msg, classifier_action) < 0)
return -1;
- nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
- if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
- return -1;
- nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
- nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
- nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
- nlattr_nested_finish(msg); /* nested act_index */
+ for (i = 0; i < nb_actions; i++)
+ if (add_action(flow, &act_index, data + i) < 0)
+ return -1;
+ nlattr_nested_finish(msg); /* nested classifier_action */
return 0;
}
@@ -1053,7 +1028,12 @@ priv_flow_process(struct pmd_internals *pmd,
}
}
if (mirred && flow) {
- uint16_t if_index = pmd->if_index;
+ struct action_data adata = {
+ .id = "mirred",
+ .mirred = {
+ .eaction = mirred,
+ },
+ };
/*
* If attr->egress && mirred, then this is a special
@@ -1061,9 +1041,13 @@ priv_flow_process(struct pmd_internals *pmd,
* redirect packets coming from the DPDK App, out
* through the remote netdevice.
*/
- if (attr->egress)
- if_index = pmd->remote_if_index;
- if (add_action_mirred(flow, if_index, mirred) < 0)
+ adata.mirred.ifindex = attr->ingress ? pmd->if_index :
+ pmd->remote_if_index;
+ if (mirred == TCA_EGRESS_MIRROR)
+ adata.mirred.action = TC_ACT_PIPE;
+ else
+ adata.mirred.action = TC_ACT_STOLEN;
+ if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0)
goto exit_action_not_supported;
else
goto end;
@@ -1077,14 +1061,33 @@ priv_flow_process(struct pmd_internals *pmd,
if (action)
goto exit_action_not_supported;
action = 1;
- if (flow)
- err = add_action_gact(flow, TC_ACT_SHOT);
+ if (flow) {
+ struct action_data adata = {
+ .id = "gact",
+ .gact = {
+ .action = TC_ACT_SHOT,
+ },
+ };
+
+ err = add_actions(flow, 1, &adata,
+ TCA_FLOWER_ACT);
+ }
} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
if (action)
goto exit_action_not_supported;
action = 1;
- if (flow)
- err = add_action_gact(flow, TC_ACT_UNSPEC);
+ if (flow) {
+ struct action_data adata = {
+ .id = "gact",
+ .gact = {
+ /* continue */
+ .action = TC_ACT_UNSPEC,
+ },
+ };
+
+ err = add_actions(flow, 1, &adata,
+ TCA_FLOWER_ACT);
+ }
} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
const struct rte_flow_action_queue *queue =
(const struct rte_flow_action_queue *)
@@ -1096,22 +1099,30 @@ priv_flow_process(struct pmd_internals *pmd,
if (!queue ||
(queue->index > pmd->dev->data->nb_rx_queues - 1))
goto exit_action_not_supported;
- if (flow)
- err = add_action_skbedit(flow, queue->index);
+ if (flow) {
+ struct action_data adata = {
+ .id = "skbedit",
+ .skbedit = {
+ .skbedit = {
+ .action = TC_ACT_PIPE,
+ },
+ .queue = queue->index,
+ },
+ };
+
+ err = add_actions(flow, 1, &adata,
+ TCA_FLOWER_ACT);
+ }
} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
- /* Fake RSS support. */
const struct rte_flow_action_rss *rss =
(const struct rte_flow_action_rss *)
actions->conf;
- if (action)
- goto exit_action_not_supported;
- action = 1;
- if (!rss || rss->num < 1 ||
- (rss->queue[0] > pmd->dev->data->nb_rx_queues - 1))
+ if (action++)
goto exit_action_not_supported;
- if (flow)
- err = add_action_skbedit(flow, rss->queue[0]);
+ if (!pmd->rss_enabled)
+ err = rss_enable(pmd);
+ (void)rss;
} else {
goto exit_action_not_supported;
}
@@ -1632,6 +1643,127 @@ tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
return 0;
}
+#define BPF_PROGRAM "tap_bpf_program.o"
+
+/**
+ * Enable RSS on tap: create leading TC rules for queuing.
+ */
+static int rss_enable(struct pmd_internals *pmd)
+{
+ struct rte_flow *rss_flow = NULL;
+ char section[64];
+ struct nlmsg *msg = NULL;
+ /* 4096 is the maximum number of instructions for a BPF program */
+ char annotation[256];
+ int bpf_fd;
+ int i;
+
+ /*
+ * Add a rule per queue to match reclassified packets and direct them to
+ * the correct queue.
+ */
+ for (i = 0; i < pmd->dev->data->nb_rx_queues; i++) {
+ struct action_data adata = {
+ .id = "skbedit",
+ .skbedit = {
+ .skbedit = {
+ .action = TC_ACT_PIPE,
+ },
+ .queue = i,
+ },
+ };
+
+ bpf_fd = 0;
+
+ rss_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
+ if (!rss_flow) {
+ RTE_LOG(ERR, PMD,
+ "Cannot allocate memory for rte_flow");
+ return -1;
+ }
+ msg = &rss_flow->msg;
+ tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, NLM_F_REQUEST |
+ NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+ msg->t.tcm_info = TC_H_MAKE((i + PRIORITY_OFFSET) << 16,
+ htons(ETH_P_ALL));
+ msg->t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+ tap_flow_set_handle(rss_flow);
+ nlattr_add(&msg->nh, TCA_KIND, sizeof("bpf"), "bpf");
+ if (nlattr_nested_start(msg, TCA_OPTIONS) < 0)
+ return -1;
+ nlattr_add32(&msg->nh, TCA_BPF_FD, bpf_fd);
+ snprintf(annotation, sizeof(annotation), "%s:[%s]",
+ BPF_PROGRAM, section);
+ nlattr_add(&msg->nh, TCA_BPF_NAME, strlen(annotation),
+ annotation);
+
+ if (add_actions(rss_flow, 1, &adata, TCA_BPF_ACT) < 0)
+ return -1;
+ nlattr_nested_finish(msg); /* nested TCA_OPTIONS */
+ /* Netlink message is now ready to be sent */
+ if (nl_send(pmd->nlsk_fd, &msg->nh) < 0)
+ return -1;
+ if (nl_recv_ack(pmd->nlsk_fd) < 0)
+ return -1;
+ LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next);
+ }
+
+ snprintf(annotation, sizeof(annotation), "%s:[%s]", BPF_PROGRAM,
+ section);
+ rss_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
+ if (!rss_flow) {
+ RTE_LOG(ERR, PMD,
+ "Cannot allocate memory for rte_flow");
+ return -1;
+ }
+ msg = &rss_flow->msg;
+ tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
+ NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+ msg->t.tcm_info =
+ TC_H_MAKE((RTE_PMD_TAP_MAX_QUEUES + PRIORITY_OFFSET) << 16,
+ htons(ETH_P_ALL));
+ msg->t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+ tap_flow_set_handle(rss_flow);
+ nlattr_add(&msg->nh, TCA_KIND, sizeof("flower"), "flower");
+ if (nlattr_nested_start(msg, TCA_OPTIONS) < 0)
+ return -1;
+
+ /* no fields for matching: all packets must match */
+ {
+ /* Actions */
+ struct action_data data[2] = {
+ [0] = {
+ .id = "bpf",
+ .bpf = {
+ .bpf_fd = bpf_fd,
+ .annotation = annotation,
+ },
+ },
+ [1] = {
+ .id = "gact",
+ .gact = {
+ /* continue */
+ .action = TC_ACT_UNSPEC,
+ },
+ },
+ };
+
+ if (add_actions(rss_flow, 2, data, TCA_FLOWER_ACT) < 0)
+ return -1;
+ }
+ nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
+ nlattr_nested_finish(msg); /* nested TCA_OPTIONS */
+ /* Netlink message is now ready to be sent */
+ if (nl_send(pmd->nlsk_fd, &msg->nh) < 0)
+ return -1;
+ if (nl_recv_ack(pmd->nlsk_fd) < 0)
+ return -1;
+ LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next);
+
+ pmd->rss_enabled = 1;
+ return 0;
+}
+
/**
* Manage filter operations.
*