[v2,4/7] baseband/acc: allocate FCW memory separately

Message ID 20230921204349.3285318-5-nicolas.chautru@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Headers
Series VRB2 bbdev PMD introduction |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Chautru, Nicolas Sept. 21, 2023, 8:43 p.m. UTC
  This allows more flexibility to the FCW size for the
unified driver. No actual functional change.

Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
---
 drivers/baseband/acc/acc_common.h  |  4 +++-
 drivers/baseband/acc/rte_vrb_pmd.c | 25 ++++++++++++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)
  

Comments

Maxime Coquelin Sept. 27, 2023, 8:28 a.m. UTC | #1
On 9/21/23 22:43, Nicolas Chautru wrote:
> This allows more flexibility to the FCW size for the
> unified driver. No actual functional change.
> 
> Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> ---
>   drivers/baseband/acc/acc_common.h  |  4 +++-
>   drivers/baseband/acc/rte_vrb_pmd.c | 25 ++++++++++++++++++++++++-
>   2 files changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/baseband/acc/acc_common.h b/drivers/baseband/acc/acc_common.h
> index df18506e75..b5ee113faf 100644
> --- a/drivers/baseband/acc/acc_common.h
> +++ b/drivers/baseband/acc/acc_common.h
> @@ -101,6 +101,7 @@
>   #define ACC_NUM_QGRPS_PER_WORD         8
>   #define ACC_MAX_NUM_QGRPS              32
>   #define ACC_RING_SIZE_GRANULARITY      64
> +#define ACC_MAX_FCW_SIZE              128
>   
>   /* Constants from K0 computation from 3GPP 38.212 Table 5.4.2.1-2 */
>   #define ACC_N_ZC_1 66 /* N = 66 Zc for BG 1 */
> @@ -582,13 +583,14 @@ struct __rte_cache_aligned acc_queue {
>   	uint32_t aq_enqueued;  /* Count how many "batches" have been enqueued */
>   	uint32_t aq_dequeued;  /* Count how many "batches" have been dequeued */
>   	uint32_t irq_enable;  /* Enable ops dequeue interrupts if set to 1 */
> -	struct rte_mempool *fcw_mempool;  /* FCW mempool */
>   	enum rte_bbdev_op_type op_type;  /* Type of this Queue: TE or TD */
>   	/* Internal Buffers for loopback input */
>   	uint8_t *lb_in;
>   	uint8_t *lb_out;
> +	uint8_t *fcw_ring;
>   	rte_iova_t lb_in_addr_iova;
>   	rte_iova_t lb_out_addr_iova;
> +	rte_iova_t fcw_ring_addr_iova;
>   	int8_t *derm_buffer; /* interim buffer for de-rm in SDK */
>   	struct acc_device *d;
>   };
> diff --git a/drivers/baseband/acc/rte_vrb_pmd.c b/drivers/baseband/acc/rte_vrb_pmd.c
> index 6898a0f802..f460e9ea2a 100644
> --- a/drivers/baseband/acc/rte_vrb_pmd.c
> +++ b/drivers/baseband/acc/rte_vrb_pmd.c
> @@ -883,6 +883,25 @@ vrb_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
>   		goto free_companion_ring_addr;
>   	}
>   
> +	q->fcw_ring = rte_zmalloc_socket(dev->device->driver->name,
> +			ACC_MAX_FCW_SIZE * d->sw_ring_max_depth,
> +			RTE_CACHE_LINE_SIZE, conf->socket);
> +	if (q->fcw_ring == NULL) {
> +		rte_bbdev_log(ERR, "Failed to allocate fcw_ring memory");
> +		ret = -ENOMEM;
> +		goto free_companion_ring_addr;
> +	}
> +	q->fcw_ring_addr_iova = rte_malloc_virt2iova(q->fcw_ring);

I see this is not done for other rte_malloc_virt2iova(), but this API 
may returns an error (RTE_BAD_IOVA), so it should be checked.

> +
> +	/* For FFT we need to store the FCW separately */
> +	if (conf->op_type == RTE_BBDEV_OP_FFT) {
> +		for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
> +			desc = q->ring_addr + desc_idx;
> +			desc->req.data_ptrs[0].address = q->fcw_ring_addr_iova +
> +					desc_idx * ACC_MAX_FCW_SIZE;
> +		}
> +	}
> +
>   	q->qgrp_id = (q_idx >> VRB1_GRP_ID_SHIFT) & 0xF;
>   	q->vf_id = (q_idx >> VRB1_VF_ID_SHIFT)  & 0x3F;
>   	q->aq_id = q_idx & 0xF;
> @@ -994,6 +1013,7 @@ vrb_queue_release(struct rte_bbdev *dev, uint16_t q_id)
>   	if (q != NULL) {
>   		/* Mark the Queue as un-assigned. */
>   		d->q_assigned_bit_map[q->qgrp_id] &= (~0ULL - (1 << (uint64_t) q->aq_id));
> +		rte_free(q->fcw_ring);
>   		rte_free(q->companion_ring_addr);
>   		rte_free(q->lb_in);
>   		rte_free(q->lb_out);
> @@ -3224,7 +3244,10 @@ vrb_enqueue_fft_one_op(struct acc_queue *q, struct rte_bbdev_fft_op *op,
>   	output = op->fft.base_output.data;
>   	in_offset = op->fft.base_input.offset;
>   	out_offset = op->fft.base_output.offset;
> -	fcw = &desc->req.fcw_fft;
> +
> +	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
> +			((q->sw_ring_head + total_enqueued_cbs) & q->sw_ring_wrap_mask)
> +			* ACC_MAX_FCW_SIZE);
>   
>   	vrb1_fcw_fft_fill(op, fcw);
>   	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset, &out_offset);

With above suggested check added, the patch looks good to me.

Thanks,
Maxime
  
Chautru, Nicolas Sept. 28, 2023, 8:33 p.m. UTC | #2
Hi Maxime, 

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Wednesday, September 27, 2023 1:28 AM
> To: Chautru, Nicolas <nicolas.chautru@intel.com>; dev@dpdk.org
> Cc: hemant.agrawal@nxp.com; david.marchand@redhat.com; Vargas, Hernan
> <hernan.vargas@intel.com>
> Subject: Re: [PATCH v2 4/7] baseband/acc: allocate FCW memory separately
> 
> 
> 
> On 9/21/23 22:43, Nicolas Chautru wrote:
> > This allows more flexibility to the FCW size for the unified driver.
> > No actual functional change.
> >
> > Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> > ---
> >   drivers/baseband/acc/acc_common.h  |  4 +++-
> >   drivers/baseband/acc/rte_vrb_pmd.c | 25 ++++++++++++++++++++++++-
> >   2 files changed, 27 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/baseband/acc/acc_common.h
> > b/drivers/baseband/acc/acc_common.h
> > index df18506e75..b5ee113faf 100644
> > --- a/drivers/baseband/acc/acc_common.h
> > +++ b/drivers/baseband/acc/acc_common.h
> > @@ -101,6 +101,7 @@
> >   #define ACC_NUM_QGRPS_PER_WORD         8
> >   #define ACC_MAX_NUM_QGRPS              32
> >   #define ACC_RING_SIZE_GRANULARITY      64
> > +#define ACC_MAX_FCW_SIZE              128
> >
> >   /* Constants from K0 computation from 3GPP 38.212 Table 5.4.2.1-2 */
> >   #define ACC_N_ZC_1 66 /* N = 66 Zc for BG 1 */ @@ -582,13 +583,14 @@
> > struct __rte_cache_aligned acc_queue {
> >   	uint32_t aq_enqueued;  /* Count how many "batches" have been
> enqueued */
> >   	uint32_t aq_dequeued;  /* Count how many "batches" have been
> dequeued */
> >   	uint32_t irq_enable;  /* Enable ops dequeue interrupts if set to 1 */
> > -	struct rte_mempool *fcw_mempool;  /* FCW mempool */
> >   	enum rte_bbdev_op_type op_type;  /* Type of this Queue: TE or TD
> */
> >   	/* Internal Buffers for loopback input */
> >   	uint8_t *lb_in;
> >   	uint8_t *lb_out;
> > +	uint8_t *fcw_ring;
> >   	rte_iova_t lb_in_addr_iova;
> >   	rte_iova_t lb_out_addr_iova;
> > +	rte_iova_t fcw_ring_addr_iova;
> >   	int8_t *derm_buffer; /* interim buffer for de-rm in SDK */
> >   	struct acc_device *d;
> >   };
> > diff --git a/drivers/baseband/acc/rte_vrb_pmd.c
> > b/drivers/baseband/acc/rte_vrb_pmd.c
> > index 6898a0f802..f460e9ea2a 100644
> > --- a/drivers/baseband/acc/rte_vrb_pmd.c
> > +++ b/drivers/baseband/acc/rte_vrb_pmd.c
> > @@ -883,6 +883,25 @@ vrb_queue_setup(struct rte_bbdev *dev, uint16_t
> queue_id,
> >   		goto free_companion_ring_addr;
> >   	}
> >
> > +	q->fcw_ring = rte_zmalloc_socket(dev->device->driver->name,
> > +			ACC_MAX_FCW_SIZE * d->sw_ring_max_depth,
> > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > +	if (q->fcw_ring == NULL) {
> > +		rte_bbdev_log(ERR, "Failed to allocate fcw_ring memory");
> > +		ret = -ENOMEM;
> > +		goto free_companion_ring_addr;
> > +	}
> > +	q->fcw_ring_addr_iova = rte_malloc_virt2iova(q->fcw_ring);
> 
> I see this is not done for other rte_malloc_virt2iova(), but this API may returns
> an error (RTE_BAD_IOVA), so it should be checked.

But can it really fail? I don't think so, we just got the ptr from the malloc etc...
Also we don’t do this anywhere (including out of baseband/drivers) when doing such basic virt2iova from pinned memory.
If we need to add it eventually (I doubt it) probably best to do it everywhere to keep code consistency.
Suggesting to keep this one as is if okay with you. 

> 
> > +
> > +	/* For FFT we need to store the FCW separately */
> > +	if (conf->op_type == RTE_BBDEV_OP_FFT) {
> > +		for (desc_idx = 0; desc_idx < d->sw_ring_max_depth;
> desc_idx++) {
> > +			desc = q->ring_addr + desc_idx;
> > +			desc->req.data_ptrs[0].address = q-
> >fcw_ring_addr_iova +
> > +					desc_idx * ACC_MAX_FCW_SIZE;
> > +		}
> > +	}
> > +
> >   	q->qgrp_id = (q_idx >> VRB1_GRP_ID_SHIFT) & 0xF;
> >   	q->vf_id = (q_idx >> VRB1_VF_ID_SHIFT)  & 0x3F;
> >   	q->aq_id = q_idx & 0xF;
> > @@ -994,6 +1013,7 @@ vrb_queue_release(struct rte_bbdev *dev, uint16_t
> q_id)
> >   	if (q != NULL) {
> >   		/* Mark the Queue as un-assigned. */
> >   		d->q_assigned_bit_map[q->qgrp_id] &= (~0ULL - (1 <<
> (uint64_t)
> > q->aq_id));
> > +		rte_free(q->fcw_ring);
> >   		rte_free(q->companion_ring_addr);
> >   		rte_free(q->lb_in);
> >   		rte_free(q->lb_out);
> > @@ -3224,7 +3244,10 @@ vrb_enqueue_fft_one_op(struct acc_queue *q,
> struct rte_bbdev_fft_op *op,
> >   	output = op->fft.base_output.data;
> >   	in_offset = op->fft.base_input.offset;
> >   	out_offset = op->fft.base_output.offset;
> > -	fcw = &desc->req.fcw_fft;
> > +
> > +	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
> > +			((q->sw_ring_head + total_enqueued_cbs) & q-
> >sw_ring_wrap_mask)
> > +			* ACC_MAX_FCW_SIZE);
> >
> >   	vrb1_fcw_fft_fill(op, fcw);
> >   	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset,
> > &out_offset);
> 
> With above suggested check added, the patch looks good to me.
> 
> Thanks,
> Maxime
  

Patch

diff --git a/drivers/baseband/acc/acc_common.h b/drivers/baseband/acc/acc_common.h
index df18506e75..b5ee113faf 100644
--- a/drivers/baseband/acc/acc_common.h
+++ b/drivers/baseband/acc/acc_common.h
@@ -101,6 +101,7 @@ 
 #define ACC_NUM_QGRPS_PER_WORD         8
 #define ACC_MAX_NUM_QGRPS              32
 #define ACC_RING_SIZE_GRANULARITY      64
+#define ACC_MAX_FCW_SIZE              128
 
 /* Constants from K0 computation from 3GPP 38.212 Table 5.4.2.1-2 */
 #define ACC_N_ZC_1 66 /* N = 66 Zc for BG 1 */
@@ -582,13 +583,14 @@  struct __rte_cache_aligned acc_queue {
 	uint32_t aq_enqueued;  /* Count how many "batches" have been enqueued */
 	uint32_t aq_dequeued;  /* Count how many "batches" have been dequeued */
 	uint32_t irq_enable;  /* Enable ops dequeue interrupts if set to 1 */
-	struct rte_mempool *fcw_mempool;  /* FCW mempool */
 	enum rte_bbdev_op_type op_type;  /* Type of this Queue: TE or TD */
 	/* Internal Buffers for loopback input */
 	uint8_t *lb_in;
 	uint8_t *lb_out;
+	uint8_t *fcw_ring;
 	rte_iova_t lb_in_addr_iova;
 	rte_iova_t lb_out_addr_iova;
+	rte_iova_t fcw_ring_addr_iova;
 	int8_t *derm_buffer; /* interim buffer for de-rm in SDK */
 	struct acc_device *d;
 };
diff --git a/drivers/baseband/acc/rte_vrb_pmd.c b/drivers/baseband/acc/rte_vrb_pmd.c
index 6898a0f802..f460e9ea2a 100644
--- a/drivers/baseband/acc/rte_vrb_pmd.c
+++ b/drivers/baseband/acc/rte_vrb_pmd.c
@@ -883,6 +883,25 @@  vrb_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
 		goto free_companion_ring_addr;
 	}
 
+	q->fcw_ring = rte_zmalloc_socket(dev->device->driver->name,
+			ACC_MAX_FCW_SIZE * d->sw_ring_max_depth,
+			RTE_CACHE_LINE_SIZE, conf->socket);
+	if (q->fcw_ring == NULL) {
+		rte_bbdev_log(ERR, "Failed to allocate fcw_ring memory");
+		ret = -ENOMEM;
+		goto free_companion_ring_addr;
+	}
+	q->fcw_ring_addr_iova = rte_malloc_virt2iova(q->fcw_ring);
+
+	/* For FFT we need to store the FCW separately */
+	if (conf->op_type == RTE_BBDEV_OP_FFT) {
+		for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
+			desc = q->ring_addr + desc_idx;
+			desc->req.data_ptrs[0].address = q->fcw_ring_addr_iova +
+					desc_idx * ACC_MAX_FCW_SIZE;
+		}
+	}
+
 	q->qgrp_id = (q_idx >> VRB1_GRP_ID_SHIFT) & 0xF;
 	q->vf_id = (q_idx >> VRB1_VF_ID_SHIFT)  & 0x3F;
 	q->aq_id = q_idx & 0xF;
@@ -994,6 +1013,7 @@  vrb_queue_release(struct rte_bbdev *dev, uint16_t q_id)
 	if (q != NULL) {
 		/* Mark the Queue as un-assigned. */
 		d->q_assigned_bit_map[q->qgrp_id] &= (~0ULL - (1 << (uint64_t) q->aq_id));
+		rte_free(q->fcw_ring);
 		rte_free(q->companion_ring_addr);
 		rte_free(q->lb_in);
 		rte_free(q->lb_out);
@@ -3224,7 +3244,10 @@  vrb_enqueue_fft_one_op(struct acc_queue *q, struct rte_bbdev_fft_op *op,
 	output = op->fft.base_output.data;
 	in_offset = op->fft.base_input.offset;
 	out_offset = op->fft.base_output.offset;
-	fcw = &desc->req.fcw_fft;
+
+	fcw = (struct acc_fcw_fft *) (q->fcw_ring +
+			((q->sw_ring_head + total_enqueued_cbs) & q->sw_ring_wrap_mask)
+			* ACC_MAX_FCW_SIZE);
 
 	vrb1_fcw_fft_fill(op, fcw);
 	vrb1_dma_desc_fft_fill(op, &desc->req, input, output, &in_offset, &out_offset);