get:
Show a patch.

patch:
Partially update a patch (only the fields supplied in the request body are changed).

put:
Update a patch.

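For reference, the endpoint can be queried with any HTTP client. The sketch below is not part of Patchwork itself; it assumes Python with the third-party requests library, and simply fetches this patch and prints a few of the fields that appear in the sample response that follows.

    import requests

    # Read-only GET on the public instance needs no authentication.
    resp = requests.get("https://patches.dpdk.org/api/patches/81214/")
    resp.raise_for_status()
    patch = resp.json()

    print(patch["name"])                # "[v2,16/22] event/dlb2: add dequeue and its burst variants"
    print(patch["state"])               # "superseded"
    print(patch["submitter"]["email"])  # submitter address
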
GET /api/patches/81214/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 81214,
    "url": "https://patches.dpdk.org/api/patches/81214/?format=api",
    "web_url": "https://patches.dpdk.org/project/dpdk/patch/1602958879-8558-17-git-send-email-timothy.mcdaniel@intel.com/",
    "project": {
        "id": 1,
        "url": "https://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<1602958879-8558-17-git-send-email-timothy.mcdaniel@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/1602958879-8558-17-git-send-email-timothy.mcdaniel@intel.com",
    "date": "2020-10-17T18:21:13",
    "name": "[v2,16/22] event/dlb2: add dequeue and its burst variants",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "1853b65fb8b56f8aadcd6dc01ccb981356e143fa",
    "submitter": {
        "id": 826,
        "url": "https://patches.dpdk.org/api/people/826/?format=api",
        "name": "Timothy McDaniel",
        "email": "timothy.mcdaniel@intel.com"
    },
    "delegate": {
        "id": 310,
        "url": "https://patches.dpdk.org/api/users/310/?format=api",
        "username": "jerin",
        "first_name": "Jerin",
        "last_name": "Jacob",
        "email": "jerinj@marvell.com"
    },
    "mbox": "https://patches.dpdk.org/project/dpdk/patch/1602958879-8558-17-git-send-email-timothy.mcdaniel@intel.com/mbox/",
    "series": [
        {
            "id": 13075,
            "url": "https://patches.dpdk.org/api/series/13075/?format=api",
            "web_url": "https://patches.dpdk.org/project/dpdk/list/?series=13075",
            "date": "2020-10-17T18:20:57",
            "name": "Add DLB2 PMD",
            "version": 2,
            "mbox": "https://patches.dpdk.org/series/13075/mbox/"
        }
    ],
    "comments": "https://patches.dpdk.org/api/patches/81214/comments/",
    "check": "success",
    "checks": "https://patches.dpdk.org/api/patches/81214/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 68D40A04DB;\n\tSat, 17 Oct 2020 20:25:34 +0200 (CEST)",
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id A0EA9CF99;\n\tSat, 17 Oct 2020 20:19:52 +0200 (CEST)",
            "from mga12.intel.com (mga12.intel.com [192.55.52.136])\n by dpdk.org (Postfix) with ESMTP id 2ACA5CA46\n for <dev@dpdk.org>; Sat, 17 Oct 2020 20:19:35 +0200 (CEST)",
            "from orsmga005.jf.intel.com ([10.7.209.41])\n by fmsmga106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;\n 17 Oct 2020 11:19:34 -0700",
            "from txasoft-yocto.an.intel.com ([10.123.72.192])\n by orsmga005.jf.intel.com with ESMTP; 17 Oct 2020 11:19:33 -0700"
        ],
        "IronPort-SDR": [
            "\n m3txu6d4JPP8mrlercyDim9YA9whThYjTZ8iTXK3L0OaUjv9Kh+0Va50Aa5uZkt/39txcpLpax\n SAZjZ+gn6qlw==",
            "\n d/JgsnXT8A6RQZJVeFHkJtGNtMQ040kn1TsJttFUmaSg5K76OJYh5JHRtnOw5SEqaSErz4GD88\n ix3W+SA6fWeQ=="
        ],
        "X-IronPort-AV": [
            "E=McAfee;i=\"6000,8403,9777\"; a=\"146122191\"",
            "E=Sophos;i=\"5.77,387,1596524400\"; d=\"scan'208\";a=\"146122191\"",
            "E=Sophos;i=\"5.77,387,1596524400\"; d=\"scan'208\";a=\"532129682\""
        ],
        "X-Amp-Result": "SKIPPED(no attachment in message)",
        "X-Amp-File-Uploaded": "False",
        "X-ExtLoop1": "1",
        "From": "Timothy McDaniel <timothy.mcdaniel@intel.com>",
        "To": "",
        "Cc": "dev@dpdk.org, erik.g.carrillo@intel.com, gage.eads@intel.com,\n harry.van.haaren@intel.com, jerinj@marvell.com",
        "Date": "Sat, 17 Oct 2020 13:21:13 -0500",
        "Message-Id": "<1602958879-8558-17-git-send-email-timothy.mcdaniel@intel.com>",
        "X-Mailer": "git-send-email 1.7.10",
        "In-Reply-To": "<1602958879-8558-1-git-send-email-timothy.mcdaniel@intel.com>",
        "References": "<1599855987-25976-2-git-send-email-timothy.mcdaniel@intel.com>\n <1602958879-8558-1-git-send-email-timothy.mcdaniel@intel.com>",
        "Subject": "[dpdk-dev] [PATCH v2 16/22] event/dlb2: add dequeue and its burst\n\tvariants",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Add support for dequeue, dequeue_burst, ...\n\nDLB2 does not currently support interrupts, but instead use\numonitor/umwait if supported by the processor. This allows\nthe software to monitor and wait on writes to a cache-line.\n\nDLB2 supports normal and sparse cq mode. In normal mode the\nhardware will pack 4 QEs into each cache line. In sparse cq\nmode, the hardware will only populate one QE per cache line.\nSoftware must be aware of the cq mode, and take the appropriate\nactions, based on the mode.\n\nSigned-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>\n---\n drivers/event/dlb2/dlb2.c | 761 ++++++++++++++++++++++++++++++++++++++++++++++\n 1 file changed, 761 insertions(+)",
    "diff": "diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c\nindex 6cef9cb..417c5d0 100644\n--- a/drivers/event/dlb2/dlb2.c\n+++ b/drivers/event/dlb2/dlb2.c\n@@ -2665,9 +2665,761 @@ dlb2_event_enqueue_forward_burst(void *event_port,\n \treturn dlb2_event_enqueue_burst(event_port, events, num);\n }\n \n+static inline void\n+dlb2_port_credits_inc(struct dlb2_port *qm_port, int num)\n+{\n+\tuint32_t batch_size = DLB2_SW_CREDIT_BATCH_SZ;\n+\n+\t/* increment port credits, and return to pool if exceeds threshold */\n+\tif (!qm_port->is_directed) {\n+\t\tqm_port->cached_ldb_credits += num;\n+\t\tif (qm_port->cached_ldb_credits >= 2 * batch_size) {\n+\t\t\t__atomic_fetch_add(\n+\t\t\t\tqm_port->credit_pool[DLB2_LDB_QUEUE],\n+\t\t\t\tbatch_size, __ATOMIC_SEQ_CST);\n+\t\t\tqm_port->cached_ldb_credits -= batch_size;\n+\t\t}\n+\t} else {\n+\t\tqm_port->cached_dir_credits += num;\n+\t\tif (qm_port->cached_dir_credits >= 2 * batch_size) {\n+\t\t\t__atomic_fetch_add(\n+\t\t\t\tqm_port->credit_pool[DLB2_DIR_QUEUE],\n+\t\t\t\tbatch_size, __ATOMIC_SEQ_CST);\n+\t\t\tqm_port->cached_dir_credits -= batch_size;\n+\t\t}\n+\t}\n+}\n+\n+static inline bool\n+dlb2_cq_is_empty(struct dlb2_port *qm_port)\n+{\n+\tvolatile struct dlb2_dequeue_qe *qe_ptr;\n+\tstruct dlb2_dequeue_qe qe;\n+\n+\tqe_ptr = dlb2_port[qm_port->id][PORT_TYPE(qm_port)].cq_base;\n+\tqe = qe_ptr[qm_port->cq_idx];\n+\n+\treturn (qe.cq_gen != qm_port->gen_bit);\n+}\n+\n+static inline int\n+dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,\n+\t\t  struct dlb2_eventdev_port *ev_port,\n+\t\t  struct dlb2_port *qm_port,\n+\t\t  uint64_t timeout,\n+\t\t  uint64_t start_ticks)\n+{\n+\tstruct process_local_port_data *port_data;\n+\tuint64_t elapsed_ticks;\n+\n+\tport_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];\n+\n+\telapsed_ticks = rte_get_timer_cycles() - start_ticks;\n+\n+\t/* Wait/poll time expired */\n+\tif (elapsed_ticks >= timeout) {\n+\t\treturn 1;\n+\t} else if (dlb2->umwait_allowed) {\n+\t\tvolatile struct dlb2_dequeue_qe *cq_base;\n+\n+\t\tcq_base = port_data->cq_base;\n+\n+\t\t/* Block on cache line write to CQ. Note: it's\n+\t\t * safe to access the per-process cq_base\n+\t\t * address here, since the PMD has already\n+\t\t * attempted at least one CQ dequeue.\n+\t\t */\n+\t\tdlb2_umonitor(&cq_base[qm_port->cq_idx]);\n+\n+\t\t/* Avoid race condition. 
Check if still empty */\n+\t\tif (dlb2_cq_is_empty(qm_port)) {\n+\t\t\tdlb2_umwait(RTE_LIBRTE_PMD_DLB2_UMWAIT_CTL_STATE,\n+\t\t\t\t    timeout + start_ticks);\n+\t\t\tDLB2_INC_STAT(\n+\t\t\t\tev_port->stats.traffic.rx_umonitor_umwait, 1);\n+\t\t}\n+\t} else {\n+\t\tuint64_t poll_interval = RTE_LIBRTE_PMD_DLB2_POLL_INTERVAL;\n+\t\tuint64_t curr_ticks = rte_get_timer_cycles();\n+\t\tuint64_t init_ticks = curr_ticks;\n+\n+\t\twhile ((curr_ticks - start_ticks < timeout) &&\n+\t\t       (curr_ticks - init_ticks < poll_interval))\n+\t\t\tcurr_ticks = rte_get_timer_cycles();\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static inline int\n+dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,\n+\t\t\t struct dlb2_port *qm_port,\n+\t\t\t struct rte_event *events,\n+\t\t\t struct dlb2_dequeue_qe *qes,\n+\t\t\t int cnt)\n+{\n+\tuint8_t *qid_mappings = qm_port->qid_mappings;\n+\tint i, num, evq_id;\n+\n+\tfor (i = 0, num = 0; i < cnt; i++) {\n+\t\tstruct dlb2_dequeue_qe *qe = &qes[i];\n+\t\tint sched_type_map[DLB2_NUM_HW_SCHED_TYPES] = {\n+\t\t\t[DLB2_SCHED_ATOMIC] = RTE_SCHED_TYPE_ATOMIC,\n+\t\t\t[DLB2_SCHED_UNORDERED] = RTE_SCHED_TYPE_PARALLEL,\n+\t\t\t[DLB2_SCHED_ORDERED] = RTE_SCHED_TYPE_ORDERED,\n+\t\t\t[DLB2_SCHED_DIRECTED] = RTE_SCHED_TYPE_ATOMIC,\n+\t\t};\n+\n+\t\t/* Fill in event information.\n+\t\t * Note that flow_id must be embedded in the data by\n+\t\t * the app, such as the mbuf RSS hash field if the data\n+\t\t * buffer is a mbuf.\n+\t\t */\n+\t\tif (unlikely(qe->error)) {\n+\t\t\tDLB2_LOG_ERR(\"QE error bit ON\\n\");\n+\t\t\tDLB2_INC_STAT(ev_port->stats.traffic.rx_drop, 1);\n+\t\t\tdlb2_consume_qe_immediate(qm_port, 1);\n+\t\t\tcontinue; /* Ignore */\n+\t\t}\n+\n+\t\tevents[num].u64 = qe->data;\n+\t\tevents[num].flow_id = qe->flow_id;\n+\t\tevents[num].priority = DLB2_TO_EV_PRIO((uint8_t)qe->priority);\n+\t\tevents[num].event_type = qe->u.event_type.major;\n+\t\tevents[num].sub_event_type = qe->u.event_type.sub;\n+\t\tevents[num].sched_type = sched_type_map[qe->sched_type];\n+\t\tevents[num].impl_opaque = qe->qid_depth;\n+\n+\t\t/* qid not preserved for directed queues */\n+\t\tif (qm_port->is_directed)\n+\t\t\tevq_id = ev_port->link[0].queue_id;\n+\t\telse\n+\t\t\tevq_id = qid_mappings[qe->qid];\n+\n+\t\tevents[num].queue_id = evq_id;\n+\t\tDLB2_INC_STAT(\n+\t\t\tev_port->stats.queue[evq_id].qid_depth[qe->qid_depth],\n+\t\t\t1);\n+\t\tDLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qe->sched_type], 1);\n+\t\tnum++;\n+\t}\n+\n+\tDLB2_INC_STAT(ev_port->stats.traffic.rx_ok, num);\n+\n+\treturn num;\n+}\n+\n+static inline int\n+dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,\n+\t\t\t      struct dlb2_port *qm_port,\n+\t\t\t      struct rte_event *events,\n+\t\t\t      struct dlb2_dequeue_qe *qes)\n+{\n+\tint sched_type_map[] = {\n+\t\t[DLB2_SCHED_ATOMIC] = RTE_SCHED_TYPE_ATOMIC,\n+\t\t[DLB2_SCHED_UNORDERED] = RTE_SCHED_TYPE_PARALLEL,\n+\t\t[DLB2_SCHED_ORDERED] = RTE_SCHED_TYPE_ORDERED,\n+\t\t[DLB2_SCHED_DIRECTED] = RTE_SCHED_TYPE_ATOMIC,\n+\t};\n+\tconst int num_events = DLB2_NUM_QES_PER_CACHE_LINE;\n+\tuint8_t *qid_mappings = qm_port->qid_mappings;\n+\t__m128i sse_evt[2];\n+\n+\t/* In the unlikely case that any of the QE error bits are set, process\n+\t * them one at a time.\n+\t */\n+\tif (unlikely(qes[0].error || qes[1].error ||\n+\t\t     qes[2].error || qes[3].error))\n+\t\treturn dlb2_process_dequeue_qes(ev_port, qm_port, events,\n+\t\t\t\t\t\t qes, num_events);\n+\n+\tevents[0].u64 = qes[0].data;\n+\tevents[1].u64 = qes[1].data;\n+\tevents[2].u64 = 
qes[2].data;\n+\tevents[3].u64 = qes[3].data;\n+\n+\t/* Construct the metadata portion of two struct rte_events\n+\t * in one 128b SSE register. Event metadata is constructed in the SSE\n+\t * registers like so:\n+\t * sse_evt[0][63:0]:   event[0]'s metadata\n+\t * sse_evt[0][127:64]: event[1]'s metadata\n+\t * sse_evt[1][63:0]:   event[2]'s metadata\n+\t * sse_evt[1][127:64]: event[3]'s metadata\n+\t */\n+\tsse_evt[0] = _mm_setzero_si128();\n+\tsse_evt[1] = _mm_setzero_si128();\n+\n+\t/* Convert the hardware queue ID to an event queue ID and store it in\n+\t * the metadata:\n+\t * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]\n+\t * sse_evt[0][111:104] = qid_mappings[qes[1].qid]\n+\t * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]\n+\t * sse_evt[1][111:104] = qid_mappings[qes[3].qid]\n+\t */\n+#define RTE_EVENT_QUEUE_ID_BYTE 5\n+\tsse_evt[0] = _mm_insert_epi8(sse_evt[0],\n+\t\t\t\t     qid_mappings[qes[0].qid],\n+\t\t\t\t     RTE_EVENT_QUEUE_ID_BYTE);\n+\tsse_evt[0] = _mm_insert_epi8(sse_evt[0],\n+\t\t\t\t     qid_mappings[qes[1].qid],\n+\t\t\t\t     RTE_EVENT_QUEUE_ID_BYTE + 8);\n+\tsse_evt[1] = _mm_insert_epi8(sse_evt[1],\n+\t\t\t\t     qid_mappings[qes[2].qid],\n+\t\t\t\t     RTE_EVENT_QUEUE_ID_BYTE);\n+\tsse_evt[1] = _mm_insert_epi8(sse_evt[1],\n+\t\t\t\t     qid_mappings[qes[3].qid],\n+\t\t\t\t     RTE_EVENT_QUEUE_ID_BYTE + 8);\n+\n+\t/* Convert the hardware priority to an event priority and store it in\n+\t * the metadata, while also returning the queue depth status\n+\t * value captured by the hardware, storing it in impl_opaque, which can\n+\t * be read by the application but not modified\n+\t * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)\n+\t * sse_evt[0][63:56]   = qes[0].qid_depth\n+\t * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)\n+\t * sse_evt[0][127:120] = qes[1].qid_depth\n+\t * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)\n+\t * sse_evt[1][63:56]   = qes[2].qid_depth\n+\t * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)\n+\t * sse_evt[1][127:120] = qes[3].qid_depth\n+\t */\n+#define RTE_EVENT_PRIO_IMPL_OPAQUE_WORD 3\n+#define RTE_BYTE_SHIFT 8\n+\tsse_evt[0] =\n+\t\t_mm_insert_epi16(sse_evt[0],\n+\t\t\tDLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |\n+\t\t\t(qes[0].qid_depth << RTE_BYTE_SHIFT),\n+\t\t\tRTE_EVENT_PRIO_IMPL_OPAQUE_WORD);\n+\tsse_evt[0] =\n+\t\t_mm_insert_epi16(sse_evt[0],\n+\t\t\tDLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |\n+\t\t\t(qes[1].qid_depth << RTE_BYTE_SHIFT),\n+\t\t\tRTE_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);\n+\tsse_evt[1] =\n+\t\t_mm_insert_epi16(sse_evt[1],\n+\t\t\tDLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |\n+\t\t\t(qes[2].qid_depth << RTE_BYTE_SHIFT),\n+\t\t\tRTE_EVENT_PRIO_IMPL_OPAQUE_WORD);\n+\tsse_evt[1] =\n+\t\t_mm_insert_epi16(sse_evt[1],\n+\t\t\tDLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |\n+\t\t\t(qes[3].qid_depth << RTE_BYTE_SHIFT),\n+\t\t\tRTE_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);\n+\n+\t/* Write the event type, sub event type, and flow_id to the event\n+\t * metadata.\n+\t * sse_evt[0][31:0]   = qes[0].flow_id |\n+\t *\t\t\tqes[0].u.event_type.major << 28 |\n+\t *\t\t\tqes[0].u.event_type.sub << 20;\n+\t * sse_evt[0][95:64]  = qes[1].flow_id |\n+\t *\t\t\tqes[1].u.event_type.major << 28 |\n+\t *\t\t\tqes[1].u.event_type.sub << 20;\n+\t * sse_evt[1][31:0]   = qes[2].flow_id |\n+\t *\t\t\tqes[2].u.event_type.major << 28 |\n+\t *\t\t\tqes[2].u.event_type.sub << 20;\n+\t * sse_evt[1][95:64]  = qes[3].flow_id |\n+\t *\t\t\tqes[3].u.event_type.major << 28 |\n+\t *\t\t\tqes[3].u.event_type.sub << 20;\n+\t 
*/\n+#define RTE_EVENT_EV_TYPE_DW 0\n+#define RTE_EVENT_EV_TYPE_SHIFT 28\n+#define RTE_EVENT_SUB_EV_TYPE_SHIFT 20\n+\tsse_evt[0] = _mm_insert_epi32(sse_evt[0],\n+\t\t\tqes[0].flow_id |\n+\t\t\tqes[0].u.event_type.major << RTE_EVENT_EV_TYPE_SHIFT |\n+\t\t\tqes[0].u.event_type.sub <<  RTE_EVENT_SUB_EV_TYPE_SHIFT,\n+\t\t\tRTE_EVENT_EV_TYPE_DW);\n+\tsse_evt[0] = _mm_insert_epi32(sse_evt[0],\n+\t\t\tqes[1].flow_id |\n+\t\t\tqes[1].u.event_type.major << RTE_EVENT_EV_TYPE_SHIFT |\n+\t\t\tqes[1].u.event_type.sub <<  RTE_EVENT_SUB_EV_TYPE_SHIFT,\n+\t\t\tRTE_EVENT_EV_TYPE_DW + 2);\n+\tsse_evt[1] = _mm_insert_epi32(sse_evt[1],\n+\t\t\tqes[2].flow_id |\n+\t\t\tqes[2].u.event_type.major << RTE_EVENT_EV_TYPE_SHIFT |\n+\t\t\tqes[2].u.event_type.sub <<  RTE_EVENT_SUB_EV_TYPE_SHIFT,\n+\t\t\tRTE_EVENT_EV_TYPE_DW);\n+\tsse_evt[1] = _mm_insert_epi32(sse_evt[1],\n+\t\t\tqes[3].flow_id |\n+\t\t\tqes[3].u.event_type.major << RTE_EVENT_EV_TYPE_SHIFT  |\n+\t\t\tqes[3].u.event_type.sub << RTE_EVENT_SUB_EV_TYPE_SHIFT,\n+\t\t\tRTE_EVENT_EV_TYPE_DW + 2);\n+\n+\t/* Write the sched type to the event metadata. 'op' and 'rsvd' are not\n+\t * set:\n+\t * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6\n+\t * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6\n+\t * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6\n+\t * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6\n+\t */\n+#define RTE_EVENT_SCHED_TYPE_BYTE 4\n+#define RTE_EVENT_SCHED_TYPE_SHIFT 6\n+\tsse_evt[0] = _mm_insert_epi8(sse_evt[0],\n+\t\tsched_type_map[qes[0].sched_type] << RTE_EVENT_SCHED_TYPE_SHIFT,\n+\t\tRTE_EVENT_SCHED_TYPE_BYTE);\n+\tsse_evt[0] = _mm_insert_epi8(sse_evt[0],\n+\t\tsched_type_map[qes[1].sched_type] << RTE_EVENT_SCHED_TYPE_SHIFT,\n+\t\tRTE_EVENT_SCHED_TYPE_BYTE + 8);\n+\tsse_evt[1] = _mm_insert_epi8(sse_evt[1],\n+\t\tsched_type_map[qes[2].sched_type] << RTE_EVENT_SCHED_TYPE_SHIFT,\n+\t\tRTE_EVENT_SCHED_TYPE_BYTE);\n+\tsse_evt[1] = _mm_insert_epi8(sse_evt[1],\n+\t\tsched_type_map[qes[3].sched_type] << RTE_EVENT_SCHED_TYPE_SHIFT,\n+\t\tRTE_EVENT_SCHED_TYPE_BYTE + 8);\n+\n+\t/* Store the metadata to the event (use the double-precision\n+\t * _mm_storeh_pd because there is no integer function for storing the\n+\t * upper 64b):\n+\t * events[0].event = sse_evt[0][63:0]\n+\t * events[1].event = sse_evt[0][127:64]\n+\t * events[2].event = sse_evt[1][63:0]\n+\t * events[3].event = sse_evt[1][127:64]\n+\t */\n+\t_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);\n+\t_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);\n+\t_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);\n+\t_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);\n+\n+\tDLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);\n+\tDLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);\n+\tDLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[2].sched_type], 1);\n+\tDLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[3].sched_type], 1);\n+\n+\tDLB2_INC_STAT(\n+\t\tev_port->stats.queue[events[0].queue_id].\n+\t\t\tqid_depth[qes[0].qid_depth],\n+\t\t1);\n+\tDLB2_INC_STAT(\n+\t\tev_port->stats.queue[events[1].queue_id].\n+\t\t\tqid_depth[qes[1].qid_depth],\n+\t\t1);\n+\tDLB2_INC_STAT(\n+\t\tev_port->stats.queue[events[2].queue_id].\n+\t\t\tqid_depth[qes[2].qid_depth],\n+\t\t1);\n+\tDLB2_INC_STAT(\n+\t\tev_port->stats.queue[events[3].queue_id].\n+\t\t\tqid_depth[qes[3].qid_depth],\n+\t\t1);\n+\n+\tDLB2_INC_STAT(ev_port->stats.traffic.rx_ok, num_events);\n+\n+\treturn num_events;\n+}\n+\n+static 
__rte_always_inline int\n+dlb2_recv_qe_sparse(struct dlb2_port *qm_port, struct dlb2_dequeue_qe *qe)\n+{\n+\tvolatile struct dlb2_dequeue_qe *cq_addr;\n+\tuint8_t xor_mask[2] = {0x0F, 0x00};\n+\tconst uint8_t and_mask = 0x0F;\n+\t__m128i *qes = (__m128i *)qe;\n+\tuint8_t gen_bits, gen_bit;\n+\tuintptr_t addr[4];\n+\tuint16_t idx;\n+\n+\tcq_addr = dlb2_port[qm_port->id][PORT_TYPE(qm_port)].cq_base;\n+\n+\tidx = qm_port->cq_idx;\n+\n+\t/* Load the next 4 QEs */\n+\taddr[0] = (uintptr_t)&cq_addr[idx];\n+\taddr[1] = (uintptr_t)&cq_addr[(idx +  4) & qm_port->cq_depth_mask];\n+\taddr[2] = (uintptr_t)&cq_addr[(idx +  8) & qm_port->cq_depth_mask];\n+\taddr[3] = (uintptr_t)&cq_addr[(idx + 12) & qm_port->cq_depth_mask];\n+\n+\t/* Prefetch next batch of QEs (all CQs occupy minimum 8 cache lines) */\n+\trte_prefetch0(&cq_addr[(idx + 16) & qm_port->cq_depth_mask]);\n+\trte_prefetch0(&cq_addr[(idx + 20) & qm_port->cq_depth_mask]);\n+\trte_prefetch0(&cq_addr[(idx + 24) & qm_port->cq_depth_mask]);\n+\trte_prefetch0(&cq_addr[(idx + 28) & qm_port->cq_depth_mask]);\n+\n+\t/* Correct the xor_mask for wrap-around QEs */\n+\tgen_bit = qm_port->gen_bit;\n+\txor_mask[gen_bit] ^= !!((idx +  4) > qm_port->cq_depth_mask) << 1;\n+\txor_mask[gen_bit] ^= !!((idx +  8) > qm_port->cq_depth_mask) << 2;\n+\txor_mask[gen_bit] ^= !!((idx + 12) > qm_port->cq_depth_mask) << 3;\n+\n+\t/* Read the cache lines backwards to ensure that if QE[N] (N > 0) is\n+\t * valid, then QEs[0:N-1] are too.\n+\t */\n+\tqes[3] = _mm_load_si128((__m128i *)(void *)addr[3]);\n+\trte_compiler_barrier();\n+\tqes[2] = _mm_load_si128((__m128i *)(void *)addr[2]);\n+\trte_compiler_barrier();\n+\tqes[1] = _mm_load_si128((__m128i *)(void *)addr[1]);\n+\trte_compiler_barrier();\n+\tqes[0] = _mm_load_si128((__m128i *)(void *)addr[0]);\n+\n+\t/* Extract and combine the gen bits */\n+\tgen_bits = ((_mm_extract_epi8(qes[0], 15) & 0x1) << 0) |\n+\t\t   ((_mm_extract_epi8(qes[1], 15) & 0x1) << 1) |\n+\t\t   ((_mm_extract_epi8(qes[2], 15) & 0x1) << 2) |\n+\t\t   ((_mm_extract_epi8(qes[3], 15) & 0x1) << 3);\n+\n+\t/* XOR the combined bits such that a 1 represents a valid QE */\n+\tgen_bits ^= xor_mask[gen_bit];\n+\n+\t/* Mask off gen bits we don't care about */\n+\tgen_bits &= and_mask;\n+\n+\treturn __builtin_popcount(gen_bits);\n+}\n+\n+static inline void\n+dlb2_inc_cq_idx(struct dlb2_port *qm_port, int cnt)\n+{\n+\tuint16_t idx = qm_port->cq_idx_unmasked + cnt;\n+\n+\tqm_port->cq_idx_unmasked = idx;\n+\tqm_port->cq_idx = idx & qm_port->cq_depth_mask;\n+\tqm_port->gen_bit = (~(idx >> qm_port->gen_bit_shift)) & 0x1;\n+}\n+\n+static int\n+dlb2_event_release(struct dlb2_eventdev *dlb2,\n+\t\t   uint8_t port_id,\n+\t\t   int n)\n+{\n+\tstruct process_local_port_data *port_data;\n+\tstruct dlb2_eventdev_port *ev_port;\n+\tstruct dlb2_port *qm_port;\n+\tint i, cnt;\n+\n+\tif (port_id > dlb2->num_ports) {\n+\t\tDLB2_LOG_ERR(\"Invalid port id %d in dlb2-event_release\\n\",\n+\t\t\t     port_id);\n+\t\trte_errno = -EINVAL;\n+\t\treturn rte_errno;\n+\t}\n+\n+\tev_port = &dlb2->ev_ports[port_id];\n+\tqm_port = &ev_port->qm_port;\n+\tport_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];\n+\n+\tcnt = 0;\n+\n+\tif (qm_port->is_directed) {\n+\t\tcnt = n;\n+\t\tgoto sw_credit_update;\n+\t}\n+\n+\tfor (i = 0; i < n; i += DLB2_NUM_QES_PER_CACHE_LINE) {\n+\t\tint j;\n+\n+\t\t/* Zero-out QEs */\n+\t\tqm_port->qe4[0].cmd_byte = 0;\n+\t\tqm_port->qe4[1].cmd_byte = 0;\n+\t\tqm_port->qe4[2].cmd_byte = 0;\n+\t\tqm_port->qe4[3].cmd_byte = 0;\n+\n+\t\tfor (j = 0; j < 
DLB2_NUM_QES_PER_CACHE_LINE && (i + j) < n; j++)\n+\t\t\tqm_port->qe4[j].cmd_byte = DLB2_COMP_CMD_BYTE;\n+\n+\t\tqm_port->issued_releases += j;\n+\n+\t\tif (j == 0)\n+\t\t\tbreak;\n+\n+\t\tdlb2_hw_do_enqueue(qm_port, i == 0, port_data);\n+\n+\t\tcnt += j;\n+\t}\n+\n+sw_credit_update:\n+\t/* each release returns one credit */\n+\tif (!ev_port->outstanding_releases) {\n+\t\tDLB2_LOG_ERR(\"Unrecoverable application error. Outstanding releases underflowed.\\n\");\n+\t\trte_errno = -ENOTRECOVERABLE;\n+\t\treturn rte_errno;\n+\t}\n+\n+\tev_port->outstanding_releases -= cnt;\n+\tev_port->inflight_credits += cnt;\n+\n+\t/* Replenish s/w credits if enough releases are performed */\n+\tdlb2_replenish_sw_credits(dlb2, ev_port);\n+\treturn 0;\n+}\n+\n+static inline int16_t\n+dlb2_hw_dequeue_sparse(struct dlb2_eventdev *dlb2,\n+\t\t       struct dlb2_eventdev_port *ev_port,\n+\t\t       struct rte_event *events,\n+\t\t       uint16_t max_num,\n+\t\t       uint64_t dequeue_timeout_ticks)\n+{\n+\tuint64_t timeout;\n+\tuint64_t start_ticks = 0ULL;\n+\tstruct dlb2_port *qm_port;\n+\tint num = 0;\n+\n+\tqm_port = &ev_port->qm_port;\n+\n+\t/* We have a special implementation for waiting. Wait can be:\n+\t * 1) no waiting at all\n+\t * 2) busy poll only\n+\t * 3) wait for interrupt. If wakeup and poll time\n+\t * has expired, then return to caller\n+\t * 4) umonitor/umwait repeatedly up to poll time\n+\t */\n+\n+\t/* If configured for per dequeue wait, then use wait value provided\n+\t * to this API. Otherwise we must use the global\n+\t * value from eventdev config time.\n+\t */\n+\tif (!dlb2->global_dequeue_wait)\n+\t\ttimeout = dequeue_timeout_ticks;\n+\telse\n+\t\ttimeout = dlb2->global_dequeue_wait_ticks;\n+\n+\tstart_ticks = rte_get_timer_cycles();\n+\n+\twhile (num < max_num) {\n+\t\tstruct dlb2_dequeue_qe qes[DLB2_NUM_QES_PER_CACHE_LINE];\n+\t\tint num_avail;\n+\n+\t\t/* Copy up to 4 QEs from the current cache line into qes */\n+\t\tnum_avail = dlb2_recv_qe_sparse(qm_port, qes);\n+\n+\t\t/* But don't process more than the user requested */\n+\t\tnum_avail = RTE_MIN(num_avail, max_num - num);\n+\n+\t\tdlb2_inc_cq_idx(qm_port, num_avail << 2);\n+\n+\t\tif (num_avail == DLB2_NUM_QES_PER_CACHE_LINE)\n+\t\t\tnum += dlb2_process_dequeue_four_qes(ev_port,\n+\t\t\t\t\t\t\t      qm_port,\n+\t\t\t\t\t\t\t      &events[num],\n+\t\t\t\t\t\t\t      &qes[0]);\n+\t\telse if (num_avail)\n+\t\t\tnum += dlb2_process_dequeue_qes(ev_port,\n+\t\t\t\t\t\t\t qm_port,\n+\t\t\t\t\t\t\t &events[num],\n+\t\t\t\t\t\t\t &qes[0],\n+\t\t\t\t\t\t\t num_avail);\n+\t\telse if ((timeout == 0) || (num > 0))\n+\t\t\t/* Not waiting in any form, or 1+ events received? 
*/\n+\t\t\tbreak;\n+\t\telse if (dlb2_dequeue_wait(dlb2, ev_port, qm_port,\n+\t\t\t\t\t   timeout, start_ticks))\n+\t\t\tbreak;\n+\t}\n+\n+\tqm_port->owed_tokens += num;\n+\n+\tif (num) {\n+\n+\t\tdlb2_consume_qe_immediate(qm_port, num);\n+\n+\t\tev_port->outstanding_releases += num;\n+\n+\t\tdlb2_port_credits_inc(qm_port, num);\n+\t}\n+\n+\treturn num;\n+}\n+\n+static __rte_always_inline int\n+dlb2_recv_qe(struct dlb2_port *qm_port, struct dlb2_dequeue_qe *qe,\n+\t     uint8_t *offset)\n+{\n+\tuint8_t xor_mask[2][4] = { {0x0F, 0x0E, 0x0C, 0x08},\n+\t\t\t\t   {0x00, 0x01, 0x03, 0x07} };\n+\tuint8_t and_mask[4] = {0x0F, 0x0E, 0x0C, 0x08};\n+\tvolatile struct dlb2_dequeue_qe *cq_addr;\n+\t__m128i *qes = (__m128i *)qe;\n+\tuint64_t *cache_line_base;\n+\tuint8_t gen_bits;\n+\n+\tcq_addr = dlb2_port[qm_port->id][PORT_TYPE(qm_port)].cq_base;\n+\tcq_addr = &cq_addr[qm_port->cq_idx];\n+\n+\tcache_line_base = (void *)(((uintptr_t)cq_addr) & ~0x3F);\n+\t*offset = ((uintptr_t)cq_addr & 0x30) >> 4;\n+\n+\t/* Load the next CQ cache line from memory. Pack these reads as tight\n+\t * as possible to reduce the chance that DLB invalidates the line while\n+\t * the CPU is reading it. Read the cache line backwards to ensure that\n+\t * if QE[N] (N > 0) is valid, then QEs[0:N-1] are too.\n+\t *\n+\t * (Valid QEs start at &qe[offset])\n+\t */\n+\tqes[3] = _mm_load_si128((__m128i *)&cache_line_base[6]);\n+\tqes[2] = _mm_load_si128((__m128i *)&cache_line_base[4]);\n+\tqes[1] = _mm_load_si128((__m128i *)&cache_line_base[2]);\n+\tqes[0] = _mm_load_si128((__m128i *)&cache_line_base[0]);\n+\n+\t/* Evict the cache line ASAP */\n+\tdlb2_cldemote(cache_line_base);\n+\n+\t/* Extract and combine the gen bits */\n+\tgen_bits = ((_mm_extract_epi8(qes[0], 15) & 0x1) << 0) |\n+\t\t   ((_mm_extract_epi8(qes[1], 15) & 0x1) << 1) |\n+\t\t   ((_mm_extract_epi8(qes[2], 15) & 0x1) << 2) |\n+\t\t   ((_mm_extract_epi8(qes[3], 15) & 0x1) << 3);\n+\n+\t/* XOR the combined bits such that a 1 represents a valid QE */\n+\tgen_bits ^= xor_mask[qm_port->gen_bit][*offset];\n+\n+\t/* Mask off gen bits we don't care about */\n+\tgen_bits &= and_mask[*offset];\n+\n+\treturn __builtin_popcount(gen_bits);\n+}\n+\n+static inline int16_t\n+dlb2_hw_dequeue(struct dlb2_eventdev *dlb2,\n+\t\tstruct dlb2_eventdev_port *ev_port,\n+\t\tstruct rte_event *events,\n+\t\tuint16_t max_num,\n+\t\tuint64_t dequeue_timeout_ticks)\n+{\n+\tuint64_t timeout;\n+\tuint64_t start_ticks = 0ULL;\n+\tstruct dlb2_port *qm_port;\n+\tint num = 0;\n+\n+\tqm_port = &ev_port->qm_port;\n+\n+\t/* We have a special implementation for waiting. Wait can be:\n+\t * 1) no waiting at all\n+\t * 2) busy poll only\n+\t * 3) wait for interrupt. If wakeup and poll time\n+\t * has expired, then return to caller\n+\t * 4) umonitor/umwait repeatedly up to poll time\n+\t */\n+\n+\t/* If configured for per dequeue wait, then use wait value provided\n+\t * to this API. 
Otherwise we must use the global\n+\t * value from eventdev config time.\n+\t */\n+\tif (!dlb2->global_dequeue_wait)\n+\t\ttimeout = dequeue_timeout_ticks;\n+\telse\n+\t\ttimeout = dlb2->global_dequeue_wait_ticks;\n+\n+\tstart_ticks = rte_get_timer_cycles();\n+\n+\twhile (num < max_num) {\n+\t\tstruct dlb2_dequeue_qe qes[DLB2_NUM_QES_PER_CACHE_LINE];\n+\t\tuint8_t offset;\n+\t\tint num_avail;\n+\n+\t\t/* Copy up to 4 QEs from the current cache line into qes */\n+\t\tnum_avail = dlb2_recv_qe(qm_port, qes, &offset);\n+\n+\t\t/* But don't process more than the user requested */\n+\t\tnum_avail = RTE_MIN(num_avail, max_num - num);\n+\n+\t\tdlb2_inc_cq_idx(qm_port, num_avail);\n+\n+\t\tif (num_avail == DLB2_NUM_QES_PER_CACHE_LINE)\n+\t\t\tnum += dlb2_process_dequeue_four_qes(ev_port,\n+\t\t\t\t\t\t\t     qm_port,\n+\t\t\t\t\t\t\t     &events[num],\n+\t\t\t\t\t\t\t     &qes[offset]);\n+\t\telse if (num_avail)\n+\t\t\tnum += dlb2_process_dequeue_qes(ev_port,\n+\t\t\t\t\t\t\tqm_port,\n+\t\t\t\t\t\t\t&events[num],\n+\t\t\t\t\t\t\t&qes[offset],\n+\t\t\t\t\t\t\tnum_avail);\n+\t\telse if ((timeout == 0) || (num > 0))\n+\t\t\t/* Not waiting in any form, or 1+ events received? */\n+\t\t\tbreak;\n+\t\telse if (dlb2_dequeue_wait(dlb2, ev_port, qm_port,\n+\t\t\t\t\t   timeout, start_ticks))\n+\t\t\tbreak;\n+\t}\n+\n+\tqm_port->owed_tokens += num;\n+\n+\tif (num) {\n+\n+\t\tdlb2_consume_qe_immediate(qm_port, num);\n+\n+\t\tev_port->outstanding_releases += num;\n+\n+\t\tdlb2_port_credits_inc(qm_port, num);\n+\t}\n+\n+\treturn num;\n+}\n+\n+static uint16_t\n+dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,\n+\t\t\t uint64_t wait)\n+{\n+\tstruct dlb2_eventdev_port *ev_port = event_port;\n+\tstruct dlb2_eventdev *dlb2 = ev_port->dlb2;\n+\tuint16_t cnt;\n+\n+\tRTE_ASSERT(ev_port->setup_done);\n+\tRTE_ASSERT(ev != NULL);\n+\n+\tif (ev_port->implicit_release && ev_port->outstanding_releases > 0) {\n+\t\tuint16_t out_rels = ev_port->outstanding_releases;\n+\n+\t\tif (dlb2_event_release(dlb2, ev_port->id, out_rels))\n+\t\t\treturn 0; /* rte_errno is set */\n+\n+\t\tDLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);\n+\t}\n+\n+\tcnt = dlb2_hw_dequeue(dlb2, ev_port, ev, num, wait);\n+\n+\tDLB2_INC_STAT(ev_port->stats.traffic.total_polls, 1);\n+\tDLB2_INC_STAT(ev_port->stats.traffic.zero_polls, ((cnt == 0) ? 1 : 0));\n+\n+\treturn cnt;\n+}\n+\n+static uint16_t\n+dlb2_event_dequeue(void *event_port, struct rte_event *ev, uint64_t wait)\n+{\n+\treturn dlb2_event_dequeue_burst(event_port, ev, 1, wait);\n+}\n+\n+static uint16_t\n+dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,\n+\t\t\t\tuint16_t num, uint64_t wait)\n+{\n+\tstruct dlb2_eventdev_port *ev_port = event_port;\n+\tstruct dlb2_eventdev *dlb2 = ev_port->dlb2;\n+\tuint16_t cnt;\n+\n+\tRTE_ASSERT(ev_port->setup_done);\n+\tRTE_ASSERT(ev != NULL);\n+\n+\tif (ev_port->implicit_release && ev_port->outstanding_releases > 0) {\n+\t\tuint16_t out_rels = ev_port->outstanding_releases;\n+\n+\t\tif (dlb2_event_release(dlb2, ev_port->id, out_rels))\n+\t\t\treturn 0; /* rte_errno is set */\n+\n+\t\tDLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);\n+\t}\n+\n+\tcnt = dlb2_hw_dequeue_sparse(dlb2, ev_port, ev, num, wait);\n+\n+\tDLB2_INC_STAT(ev_port->stats.traffic.total_polls, 1);\n+\tDLB2_INC_STAT(ev_port->stats.traffic.zero_polls, ((cnt == 0) ? 
1 : 0));\n+\treturn cnt;\n+}\n+\n+static uint16_t\n+dlb2_event_dequeue_sparse(void *event_port, struct rte_event *ev,\n+\t\t\t  uint64_t wait)\n+{\n+\treturn dlb2_event_dequeue_burst_sparse(event_port, ev, 1, wait);\n+}\n+\n static void\n dlb2_entry_points_init(struct rte_eventdev *dev)\n {\n+\tstruct dlb2_eventdev *dlb2;\n+\n \t/* Expose PMD's eventdev interface */\n \tstatic struct rte_eventdev_ops dlb2_eventdev_entry_ops = {\n \t\t.dev_infos_get    = dlb2_eventdev_info_get,\n@@ -2695,6 +3447,15 @@ dlb2_entry_points_init(struct rte_eventdev *dev)\n \tdev->enqueue_burst = dlb2_event_enqueue_burst;\n \tdev->enqueue_new_burst = dlb2_event_enqueue_new_burst;\n \tdev->enqueue_forward_burst = dlb2_event_enqueue_forward_burst;\n+\n+\tdlb2 = dev->data->dev_private;\n+\tif (dlb2->poll_mode == DLB2_CQ_POLL_MODE_SPARSE) {\n+\t\tdev->dequeue = dlb2_event_dequeue_sparse;\n+\t\tdev->dequeue_burst = dlb2_event_dequeue_burst_sparse;\n+\t} else {\n+\t\tdev->dequeue = dlb2_event_dequeue;\n+\t\tdev->dequeue_burst = dlb2_event_dequeue_burst;\n+\t}\n }\n \n int\n",
    "prefixes": [
        "v2",
        "16/22"
    ]
}
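
The "mbox" URL on the patch (and on each entry under "series") returns a raw mbox file that can be fed to git am. Updating a patch through PUT or PATCH requires authentication and maintainer rights on the project; the sketch below is a hedged example only, assuming a personal API token is available in a hypothetical PATCHWORK_TOKEN environment variable, that the instance accepts the "Authorization: Token <token>" scheme, and that the chosen state slug ("accepted") exists on this instance.

    import os
    import requests

    token = os.environ["PATCHWORK_TOKEN"]  # hypothetical variable name

    # Partial update via PATCH: only the fields in the JSON body are changed.
    # Writable fields and valid state names are instance-specific.
    resp = requests.patch(
        "https://patches.dpdk.org/api/patches/81214/",
        headers={"Authorization": f"Token {token}"},
        json={"state": "accepted", "archived": True},
    )
    resp.raise_for_status()
    print(resp.json()["state"])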