/*
 * Copyright 2018 NXP
 * All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "fsl_powerquad.h"

/*******************************************************************************
 * Definitions
 ******************************************************************************/

/* Component ID definition, used by tools. */
#ifndef FSL_COMPONENT_ID
#define FSL_COMPONENT_ID "platform.drivers.powerquad"
#endif

#define PQ_Q31_2_FLOAT(x) (((float)(x)) / 2147483648.0f)
#define PQ_Q15_2_FLOAT(x) (((float)(x)) / 32768.0f)

#define PQ_DECLARE_CONFIG \
    uint32_t outformat; \
    uint32_t inaformat; \
    uint32_t inbformat; \
    uint32_t tmpformat

#define PQ_BACKUP_CONFIG \
    outformat = POWERQUAD_NS->OUTFORMAT; \
    inaformat = POWERQUAD_NS->INAFORMAT; \
    inbformat = POWERQUAD_NS->INBFORMAT; \
    tmpformat = POWERQUAD_NS->TMPFORMAT

#define PQ_RESTORE_CONFIG \
    POWERQUAD_NS->OUTFORMAT = outformat; \
    POWERQUAD_NS->INAFORMAT = inaformat; \
    POWERQUAD_NS->INBFORMAT = inbformat; \
    POWERQUAD_NS->TMPFORMAT = tmpformat

#define PQ_SET_FIX32_CONFIG \
    POWERQUAD_NS->OUTFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->INAFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->INBFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_Float << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPBASE = 0xE0000000

#define PQ_SET_FIX16_CONFIG \
    POWERQUAD_NS->OUTFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_16Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->INAFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_16Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->INBFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_16Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_Float << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPBASE = 0xE0000000

#define PQ_SET_Q31_CONFIG \
    POWERQUAD_NS->OUTFORMAT = ((uint32_t)(-31) << 8U) | ((uint32_t)kPQ_32Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->INAFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->INBFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_Float << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPBASE = 0xE0000000

#define PQ_SET_Q15_CONFIG \
    POWERQUAD_NS->OUTFORMAT = ((uint32_t)(-15) << 8U) | ((uint32_t)kPQ_16Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->INAFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_16Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->INBFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_16Bit << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_Float << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPBASE = 0xE0000000

#define PQ_SET_F32_CONFIG \
    POWERQUAD_NS->OUTFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_Float << 4U) | kPQ_Float; \
    POWERQUAD_NS->INAFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_Float << 4U) | kPQ_Float; \
    POWERQUAD_NS->INBFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_Float << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_Float << 4U) | kPQ_Float; \
    POWERQUAD_NS->TMPBASE = 0xE0000000

#define PQ_SET_FFT_Q31_CONFIG \
    POWERQUAD_NS->OUTFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_32Bit; \
    POWERQUAD_NS->INAFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_32Bit; \
    POWERQUAD_NS->INBFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_32Bit; \
    POWERQUAD_NS->TMPFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_32Bit; \
    POWERQUAD_NS->TMPBASE = 0xE0000000

#define PQ_SET_FFT_Q15_CONFIG \
    POWERQUAD_NS->OUTFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_16Bit << 4U) | kPQ_32Bit; \
    POWERQUAD_NS->INAFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_16Bit << 4U) | kPQ_32Bit; \
    POWERQUAD_NS->INBFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_16Bit << 4U) | kPQ_32Bit; \
    POWERQUAD_NS->TMPFORMAT = ((uint32_t)(0) << 8U)   | ((uint32_t)kPQ_32Bit << 4U) | kPQ_32Bit; \
    POWERQUAD_NS->TMPBASE = 0xE0000000

/*******************************************************************************
 * Code
 ******************************************************************************/
void PQ_GetDefaultConfig(pq_config_t *config)
{
    config->inputAFormat = kPQ_Float;
    config->inputAPrescale = 0;
    config->inputBFormat = kPQ_Float;
    config->inputBPrescale = 0;
    config->outputFormat = kPQ_Float;
    config->outputPrescale = 0;
    config->tmpFormat = kPQ_Float;
    config->tmpPrescale = 0;
    config->machineFormat = kPQ_Float;
}

void PQ_SetConfig(POWERQUAD_Type *base, const pq_config_t *config)
{
    assert(config);

    base->TMPBASE = (q31_t)config->tmpBase;
    base->INAFORMAT = ((uint32_t)config->inputAPrescale << 8U) | ((uint32_t)config->inputAFormat << 4U) | config->machineFormat;
    base->INBFORMAT = ((uint32_t)config->inputBPrescale << 8U) | ((uint32_t)config->inputBFormat << 4U) | config->machineFormat;
    base->TMPFORMAT = ((uint32_t)config->tmpPrescale << 8U) | ((uint32_t)config->tmpFormat << 4U) | config->machineFormat;
    base->OUTFORMAT = ((uint32_t)config->outputPrescale << 8U) | ((uint32_t)config->outputFormat << 4U) | config->machineFormat;
}

void PQ_SetCoprocessorScaler(POWERQUAD_Type *base, const pq_prescale_t *prescale)
{
    assert(prescale);

    base->CPPRE = POWERQUAD_CPPRE_CPPRE_IN(prescale->inputPrescale)
                | POWERQUAD_CPPRE_CPPRE_OUT(prescale->outputPrescale)
                | ((uint32_t)prescale->outputSaturate << POWERQUAD_CPPRE_CPPRE_SAT_SHIFT);
}

void PQ_Init(POWERQUAD_Type *base)
{
#if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL)
    CLOCK_EnableClock(kCLOCK_PowerQuad);
#endif
#if !(defined(FSL_SDK_DISABLE_DRIVER_RESET_CONTROL) && FSL_SDK_DISABLE_DRIVER_RESET_CONTROL)
    //RESET_PeripheralReset(kPOWERQUAD_RST_SHIFT_RSTn);
#endif

    /* Enable event used for WFE. */
    base->EVENTEN = POWERQUAD_EVENTEN_EVENT_OFLOW_MASK | POWERQUAD_EVENTEN_EVENT_NAN_MASK |
                    POWERQUAD_EVENTEN_EVENT_FIXED_MASK | POWERQUAD_EVENTEN_EVENT_UFLOW_MASK |
                    POWERQUAD_EVENTEN_EVENT_BERR_MASK | POWERQUAD_EVENTEN_EVENT_COMP_MASK;
}

void PQ_Deinit(POWERQUAD_Type *base)
{
#if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL)
    CLOCK_DisableClock(kCLOCK_PowerQuad);
#endif
}

void PQ_SetFormat(POWERQUAD_Type *base, pq_computationengine_t engine, pq_format_t format)
{
    pq_config_t config;

    PQ_GetDefaultConfig(&config);

    /* 32-bit Float point */
    if (kPQ_Float == format)
    {
        config.inputAFormat = kPQ_Float;
        config.inputAPrescale = 0;
        config.inputBFormat = kPQ_Float;
        config.inputBPrescale = 0;
        config.outputFormat = kPQ_Float;
        config.outputPrescale = 0;
        config.tmpFormat = kPQ_Float;
        config.tmpPrescale = 0;
    }
    /* 32-bit Fixed point */
    if (kPQ_32Bit == format)
    {
        config.inputAFormat = kPQ_32Bit;
        config.inputAPrescale = 0;
        config.inputBFormat = kPQ_32Bit;
        config.inputBPrescale = 0;
        config.outputFormat = kPQ_32Bit;
        config.outputPrescale = 0;
        config.tmpFormat = kPQ_Float;
        config.tmpPrescale = 0;
    }
    /* 16-bit Fixed point */
    if (kPQ_16Bit == format)
    {
        config.inputAFormat = kPQ_16Bit;
        config.inputAPrescale = 0;
        config.inputBFormat = kPQ_16Bit;
        config.inputBPrescale = 0;
        config.outputFormat = kPQ_16Bit;
        config.outputPrescale = 0;
        config.tmpFormat = kPQ_Float;
        config.tmpPrescale = 0;
    }

    if (CP_FFT == engine)
    {
        config.machineFormat = kPQ_32Bit;
    }
    else
    {
        config.machineFormat = kPQ_Float;
    }

    PQ_SetConfig(base, &config);
}

void PQ_WaitDone(POWERQUAD_Type *base)
{
    /* wait for the completion */
    while ((base->CONTROL & INST_BUSY) == INST_BUSY)
    {
        //__WFE(); /* Enter to low power. */
    }
}

void PQ_VectorLnF32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_ln0(*(q31_t *)pSrc++);
            *(q31_t *)pDst++ = _pq_readAdd0();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fp(PQ_LN, 1, PQ_TRANS);
        _pq_end_vector_func();
    }
}

void PQ_VectorInvF32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_inv0(*(q31_t *)pSrc++);
            *(q31_t *)pDst++ = _pq_readMult0();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fp(PQ_INV, 0, PQ_TRANS);
        _pq_end_vector_func();
    }
}

void PQ_VectorSqrtF32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_sqrt0(*(q31_t *)pSrc++);
            *(q31_t *)pDst++ = _pq_readMult0();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fp(PQ_SQRT, 0, PQ_TRANS);
        _pq_end_vector_func();
    }
}

void PQ_VectorInvSqrtF32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_invsqrt0(*(q31_t *)pSrc++);
            *(q31_t *)pDst++ = _pq_readMult0();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fp(PQ_INVSQRT, 0, PQ_TRANS);
        _pq_end_vector_func();
    }
}

void PQ_VectorEtoxF32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_etox0(*(q31_t *)pSrc++);
            *(q31_t *)pDst++ = _pq_readMult0();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fp(PQ_ETOX, 0, PQ_TRANS);
        _pq_end_vector_func();
    }
}

void PQ_VectorEtonxF32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_etonx0(*(q31_t *)pSrc++);
            *(q31_t *)pDst++ = _pq_readMult0();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fp(PQ_ETONX, 0, PQ_TRANS);
        _pq_end_vector_func();
    }
}

void PQ_VectorSinF32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_sin0(*(q31_t *)pSrc++);
            *(q31_t *)pDst++ = _pq_readAdd0();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fp(PQ_SIN, 1, PQ_TRIG);
        _pq_end_vector_func();
    }
}

void PQ_VectorCosF32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_cos0(*(q31_t *)pSrc++);
            *(q31_t *)pDst++ = _pq_readAdd0();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fp(PQ_COS, 1, PQ_TRIG);
        _pq_end_vector_func();
    }
}

void PQ_VectorLnQ31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_ln_fx0(*pSrc++);
            *pDst++ = _pq_readAdd0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fx32(PQ_LN, 1, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorInvQ31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_inv_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fx32(PQ_INV, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorSqrtQ31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_sqrt_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fx32(PQ_SQRT, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorInvSqrtQ31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_invsqrt_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fx32(PQ_INVSQRT, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorEtoxQ31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_etox_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fx32(PQ_ETOX, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorEtonxQ31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_etonx_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fx32(PQ_ETONX, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorSinQ31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    uint32_t cppre;

    cppre = POWERQUAD_NS->CPPRE;
    POWERQUAD_NS->CPPRE = POWERQUAD_CPPRE_CPPRE_OUT(31);

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_sin_fx0(*pSrc++);
            *pDst++ = _pq_readAdd0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fx32(PQ_SIN, 1, PQ_TRIG_FIXED);
        _pq_end_vector_func();
    }

    POWERQUAD_NS->CPPRE = cppre;
}

void PQ_VectorCosQ31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    uint32_t cppre;

    cppre = POWERQUAD_NS->CPPRE;
    POWERQUAD_NS->CPPRE = POWERQUAD_CPPRE_CPPRE_OUT(31);

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_cos_fx0(*pSrc++);
            *pDst++ = _pq_readAdd0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_fx32(PQ_COS, 1, PQ_TRIG_FIXED);
        _pq_end_vector_func();
    }

    POWERQUAD_NS->CPPRE = cppre;
}

void PQ_VectorLnQ15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_ln_fx0(*pSrc++);
            *pDst++ = _pq_readAdd0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func_fx16(pSrc, pDst, length);
        _pq_vector8_fx16(PQ_LN, 1, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorInvQ15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_inv_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func_fx16(pSrc, pDst, length);
        _pq_vector8_fx16(PQ_INV, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorSqrtQ15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_sqrt_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func_fx16(pSrc, pDst, length);
        _pq_vector8_fx16(PQ_SQRT, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorInvSqrtQ15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_invsqrt_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func_fx16(pSrc, pDst, length);
        _pq_vector8_fx16(PQ_INVSQRT, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorEtoxQ15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_etox_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func_fx16(pSrc, pDst, length);
        _pq_vector8_fx16(PQ_ETOX, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorEtonxQ15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_etonx_fx0(*pSrc++);
            *pDst++ = _pq_readMult0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func_fx16(pSrc, pDst, length);
        _pq_vector8_fx16(PQ_ETONX, 0, PQ_TRANS_FIXED);
        _pq_end_vector_func();
    }
}

void PQ_VectorSinQ15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    uint32_t cppre;

    cppre = POWERQUAD_NS->CPPRE;
    POWERQUAD_NS->CPPRE = POWERQUAD_CPPRE_CPPRE_OUT(31);

    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_sin_fx0((*pSrc++) << 16);
            *pDst++ = (_pq_readAdd0_fx()) >> 16;
        }
    }

    if (length)
    {
        _pq_initiate_vector_func_q15(pSrc, pDst, length);
        _pq_vector8_q15(PQ_SIN, 1, PQ_TRIG_FIXED);
        _pq_end_vector_func();
    }

    POWERQUAD_NS->CPPRE = cppre;
}

void PQ_VectorCosQ15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    uint32_t cppre;

    cppre = POWERQUAD_NS->CPPRE;
    POWERQUAD_NS->CPPRE = POWERQUAD_CPPRE_CPPRE_OUT(31);
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_cos_fx0(*pSrc++);
            *pDst++ = _pq_readAdd0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func_q15(pSrc, pDst, length);
        _pq_vector8_q15(PQ_COS, 1, PQ_TRIG_FIXED);
        _pq_end_vector_func();
    }

    POWERQUAD_NS->CPPRE = cppre;
}

void PQ_VectorBiqaudDf2F32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_biquad0(*(q31_t *)pSrc++);
            *(q31_t *)pDst++ = _pq_readAdd0();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_biquaddf2_fp();
        _pq_end_vector_func();
    }
}

void PQ_VectorBiqaudDf2Q31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_biquad0_fx(*pSrc++);
            *pDst++ = _pq_readAdd0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func(pSrc, pDst, length);
        _pq_vector8_biquaddf2_fx32();
        _pq_end_vector_func();
    }
}

void PQ_VectorBiqaudDf2Q15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;
        while (remainderBy8--)
        {
            _pq_biquad0_fx(*pSrc++);
            *pDst++ = _pq_readAdd0_fx();
        }
    }

    if (length)
    {
        _pq_initiate_vector_func_fx16(pSrc, pDst, length);
        _pq_vector8_biquaddf2_fx16();
        _pq_end_vector_func();
    }
}

void PQ_VectorBiqaudCascadeDf2F32(float32_t *pSrc, float32_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;

        PQ_Biquad1F32(&pSrc[0], &pDst[0]);

        for (int i = 1; i < remainderBy8; i++)
        {
            _pq_biquad0(*(q31_t *)&pSrc[i - 1]);
            _pq_biquad1(*(q31_t *)&pSrc[i]);
            *(q31_t *)&pDst[i - 1] = _pq_readAdd0();
            *(q31_t *)&pDst[i] = _pq_readAdd1();
        }

        PQ_BiquadF32(&pSrc[remainderBy8 - 1], &pDst[remainderBy8 - 1]);
    }

    if (length)
    {
        _pq_initiate_vector_func(&pSrc[remainderBy8], &pDst[remainderBy8], length);
        _pq_vector8_biqauddf2cascade_fp();
        _pq_end_vector_func();
    }
}

void PQ_VectorBiqaudCascadeDf2Q31(q31_t *pSrc, q31_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;

        _pq_biquad1_fx(pSrc[0]);
        pDst[0] = _pq_readAdd1_fx();

        for (int i = 1; i < remainderBy8; i++)
        {
            _pq_biquad0_fx(pSrc[i - 1]);
            _pq_biquad1_fx(pSrc[i]);
            pDst[i - 1] = _pq_readAdd0_fx();
            pDst[i] = _pq_readAdd1_fx();
        }

        _pq_biquad0_fx(pSrc[remainderBy8 - 1]);
        pDst[remainderBy8 - 1] = _pq_readAdd0_fx();
    }

    if (length)
    {
        _pq_initiate_vector_func(&pSrc[remainderBy8], &pDst[remainderBy8], length);
        _pq_vector8_biqauddf2cascade_fx32();
        _pq_end_vector_func();
    }
}

void PQ_VectorBiqaudCascadeDf2Q15(q15_t *pSrc, q15_t *pDst, q31_t length)
{
    q31_t remainderBy8 = length % 8;

    if (remainderBy8)
    {
        length -= remainderBy8;

        _pq_biquad1_fx(pSrc[0]);
        pDst[0] = _pq_readAdd1_fx();

        for (int i = 1; i < remainderBy8; i++)
        {
            _pq_biquad0_fx(pSrc[i - 1]);
            _pq_biquad1_fx(pSrc[i]);
            pDst[i - 1] = _pq_readAdd0_fx();
            pDst[i] = _pq_readAdd1_fx();
        }

        _pq_biquad0_fx(pSrc[remainderBy8 - 1]);
        pDst[remainderBy8 - 1] = _pq_readAdd0_fx();
    }

    if (length)
    {
        _pq_initiate_vector_func_fx16(&pSrc[remainderBy8], &pDst[remainderBy8], length);
        _pq_vector8_biqauddf2cascade_fx16();
        _pq_end_vector_func();
    }
}

q31_t PQ_ArctanFixed(POWERQUAD_Type *base, q31_t x, q31_t y, pq_cordic_iter_t iteration)
{
    base->CORDIC_X = x;
    base->CORDIC_Y = y;
    base->CORDIC_Z = 0;
    base->CONTROL = (CP_CORDIC << 4) | CORDIC_ARCTAN | CORDIC_ITER(iteration);

    PQ_WaitDone(base);
    return base->CORDIC_Z;
}

q31_t PQ_ArctanhFixed(POWERQUAD_Type *base, q31_t x, q31_t y, pq_cordic_iter_t iteration)
{
    base->CORDIC_X = x;
    base->CORDIC_Y = y;
    base->CORDIC_Z = 0;
    base->CONTROL = (CP_CORDIC << 4) | CORDIC_ARCTANH | CORDIC_ITER(iteration);

    PQ_WaitDone(base);
    return base->CORDIC_Z;
}

void PQ_TransformCFFT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult)
{
    assert(pData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pData;
    base->LENGTH = length;
    base->CONTROL = (CP_FFT << 4) | PQ_TRANS_CFFT;
}

void PQ_TransformRFFT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult)
{
    assert(pData);
    assert(pResult);

    /* Set 0's for imaginary inputs as not be reading them in by the machine */
    base->GPREG[1] = 0;
    base->GPREG[3] = 0;
    base->GPREG[5] = 0;
    base->GPREG[7] = 0;
    base->GPREG[9] = 0;
    base->GPREG[11] = 0;
    base->GPREG[13] = 0;
    base->GPREG[15] = 0;
    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pData;
    base->LENGTH = length;
    base->CONTROL = (CP_FFT << 4) | PQ_TRANS_RFFT;
}

void PQ_TransformIFFT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult)
{
    assert(pData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pData;
    base->LENGTH = length;
    base->CONTROL = (CP_FFT << 4) | PQ_TRANS_IFFT;
}

void PQ_TransformCDCT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult)
{
    assert(pData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pData;
    base->LENGTH = length;
    base->CONTROL = (CP_FFT << 4) | PQ_TRANS_CDCT;
}

void PQ_TransformRDCT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult)
{
    assert(pData);
    assert(pResult);

    base->GPREG[1] = 0;
    base->GPREG[3] = 0;
    base->GPREG[5] = 0;
    base->GPREG[7] = 0;
    base->GPREG[9] = 0;
    base->GPREG[11] = 0;
    base->GPREG[13] = 0;
    base->GPREG[15] = 0;
    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pData;
    base->LENGTH = length;
    base->CONTROL = (CP_FFT << 4) | PQ_TRANS_RDCT;
}

void PQ_TransformIDCT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult)
{
    assert(pData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pData;
    base->LENGTH = length;
    base->CONTROL = (CP_FFT << 4) | PQ_TRANS_IDCT;
}

void PQ_BiquadBackUpInternalState(POWERQUAD_Type *base, int32_t biquad_num, pq_biquad_state_t *state)
{
    if (0 == biquad_num)
    {
        state->param = *(volatile pq_biquad_param_t *)&base->GPREG[0];
        state->compreg = base->COMPREG[1];
    }
    else
    {
        state->param = *(volatile pq_biquad_param_t *)&base->GPREG[8];
        state->compreg = base->COMPREG[3];
    }
}

void PQ_BiquadRestoreInternalState(POWERQUAD_Type *base, int32_t biquad_num, pq_biquad_state_t *state)
{
    if (0 == biquad_num)
    {
        *(volatile pq_biquad_param_t *)&base->GPREG[0] = state->param;
        base->COMPREG[1] = state->compreg;
    }
    else
    {
        *(volatile pq_biquad_param_t *)&base->GPREG[8] = state->param;
        base->COMPREG[3] = state->compreg;
    }
}

void PQ_FIR(
    POWERQUAD_Type *base, void *pAData, int32_t ALength, void *pBData, int32_t BLength, void *pResult, uint32_t opType)
{
    assert(pAData);
    assert(pBData);
    assert(pResult);

    base->INABASE = (uint32_t)pAData;
    base->INBBASE = (uint32_t)pBData;
    base->LENGTH = ((uint32_t)BLength << 16U) + (uint32_t)ALength;
    base->OUTBASE = (uint32_t)pResult;
    base->CONTROL = (CP_FIR << 4U) | opType;
}

void PQ_FIRIncrement(POWERQUAD_Type *base, int32_t ALength, int32_t BLength, int32_t xOffset, uint32_t opType)
{
    base->MISC = xOffset;
    base->LENGTH = ((uint32_t)BLength << 16) + (uint32_t)ALength;
    base->CONTROL = (CP_FIR << 4) | opType | PQ_FIR_INCREMENTAL;
}

void PQ_FIRIncrementAlt(POWERQUAD_Type *base, void *pAData, int32_t ALength, void *pResult, uint32_t opType)
{
    base->MISC = 0;
    base->INABASE = (uint32_t)pAData;
    base->LENGTH = (base->LENGTH & 0xFFFF0000) * (uint32_t)ALength;
    base->OUTBASE = (uint32_t)pResult;
    base->CONTROL = (CP_FIR << 4) | opType | PQ_FIR_INCREMENTAL;
}

void PQ_MatrixAddition(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult)
{
    assert(pAData);
    assert(pBData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pAData;
    base->INBBASE = (int32_t)pBData;
    base->LENGTH = length;
    base->CONTROL = (CP_MTX << 4) | PQ_MTX_ADD;
}

void PQ_MatrixSubtraction(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult)
{
    assert(pAData);
    assert(pBData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pAData;
    base->INBBASE = (int32_t)pBData;
    base->LENGTH = length;
    base->CONTROL = (CP_MTX << 4) | PQ_MTX_SUB;
}

void PQ_MatrixMultiplication(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult)
{
    assert(pAData);
    assert(pBData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pAData;
    base->INBBASE = (int32_t)pBData;
    base->LENGTH = length;
    base->CONTROL = (CP_MTX << 4) | PQ_MTX_MULT;
}

void PQ_MatrixProduct(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult)
{
    assert(pAData);
    assert(pBData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pAData;
    base->INBBASE = (int32_t)pBData;
    base->LENGTH = length;
    base->CONTROL = (CP_MTX << 4) | PQ_MTX_PROD;
}

void PQ_VectorDotProduct(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult)
{
    assert(pAData);
    assert(pBData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pAData;
    base->INBBASE = (int32_t)pBData;
    base->LENGTH = length;
    base->CONTROL = (CP_MTX << 4) | PQ_VEC_DOTP;
}

void PQ_MatrixInversion(POWERQUAD_Type *base, uint32_t length, void *pData, void *pTmpData, void *pResult)
{
    assert(pData);
    assert(pTmpData);
    assert(pResult);

    base->INABASE = (uint32_t)pData;
    base->TMPBASE = (uint32_t)pTmpData;
    base->OUTBASE = (uint32_t)pResult;
    base->LENGTH = length;
    base->CONTROL = (CP_MTX << 4) | PQ_MTX_INV;
}

void PQ_MatrixTranspose(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult)
{
    assert(pData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pData;
    base->LENGTH = length;
    base->CONTROL = (CP_MTX << 4) | PQ_MTX_TRAN;
}

void PQ_MatrixScale(POWERQUAD_Type *base, uint32_t length, float misc, void *pData, void *pResult)
{
    assert(pData);
    assert(pResult);

    base->OUTBASE = (int32_t)pResult;
    base->INABASE = (int32_t)pData;
    base->LENGTH = length;
    base->MISC = *(uint32_t *)&misc;
    base->CONTROL = (CP_MTX << 4) | PQ_MTX_SCALE;
}

float32_t arm_cos_f32(float32_t x)
{
    float tmp;

    PQ_CosF32(&x, &tmp);
    return tmp;
}

q31_t arm_cos_q31(q31_t x)
{
    /* For PQ: input -1 to 1 means -pi to pi
     * For CMSIS DSP: input 0 to 1 means pi to 2*pi */
    x *= 2;
    return PQ_CosQ31(x);
}

q15_t arm_cos_q15(q15_t x)
{
    /* For PQ: input -1 to 1 means -pi to pi
     * For CMSIS DSP: input 0 to 1 means pi to 2*pi */
    x *= 2;
    return PQ_CosQ15(x);
}

float32_t arm_sin_f32(float32_t x)
{
    float tmp;

    PQ_SinF32(&x, &tmp);
    return tmp;
}

q31_t arm_sin_q31(q31_t x)
{
    /* For PQ: input -1 to 1 means -pi to pi
     * For CMSIS DSP: input 0 to 1 means pi to 2*pi */
    x *= 2;
    return PQ_SinQ31(x);
}

q15_t arm_sin_q15(q15_t x)
{
    /* For PQ: input -1 to 1 means -pi to pi
     * For CMSIS DSP: input 0 to 1 means pi to 2*pi */
    x *= 2;
    return PQ_SinQ15(x);
}

arm_status arm_sqrt_q31(q31_t in, q31_t *pOut)
{
    uint32_t cppre;

    /* If the input is a positive number then compute the signBits. */
    if (in > 0)
    {
        cppre = POWERQUAD_NS->CPPRE;
        POWERQUAD_NS->CPPRE = POWERQUAD_CPPRE_CPPRE_IN(-31) | POWERQUAD_CPPRE_CPPRE_OUT(31);
        *pOut = PQ_SqrtFixed(in);
        POWERQUAD_NS->CPPRE = cppre;

        return (ARM_MATH_SUCCESS);
    }
    /* If the number is a negative number then store zero as its square root value */
    else
    {
        *pOut = 0;

        return (ARM_MATH_ARGUMENT_ERROR);
    }
}

arm_status arm_sqrt_q15(q15_t in, q15_t *pOut)
{
    uint32_t cppre;

    /* If the input is a positive number then compute the signBits. */
    if (in > 0)
    {
        cppre = POWERQUAD_NS->CPPRE;
        POWERQUAD_NS->CPPRE = POWERQUAD_CPPRE_CPPRE_IN(-15) | POWERQUAD_CPPRE_CPPRE_OUT(15);
        *pOut = PQ_SqrtFixed(in);
        POWERQUAD_NS->CPPRE = cppre;

        return (ARM_MATH_SUCCESS);
    }
    /* If the number is a negative number then store zero as its square root value */
    else
    {
        *pOut = 0;

        return (ARM_MATH_ARGUMENT_ERROR);
    }
}

void arm_cfft_q31(const arm_cfft_instance_q31 *S, q31_t *p1, uint8_t ifftFlag, uint8_t bitReverseFlag)
{
    assert(bitReverseFlag == 1);

    q31_t *pIn = p1;
    q31_t *pOut = p1;
    uint32_t length = S->fftLen;

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_FFT_Q31_CONFIG;

    if (ifftFlag == 1U)
    {
        PQ_TransformIFFT(POWERQUAD_NS, length, pIn, pOut);
    }
    else
    {
        PQ_TransformCFFT(POWERQUAD_NS, length, pIn, pOut);
    }

    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_cfft_q15(const arm_cfft_instance_q15 *S, q15_t *p1, uint8_t ifftFlag, uint8_t bitReverseFlag)
{
    assert(bitReverseFlag == 1);

    q15_t *pIn = p1;
    q15_t *pOut = p1;
    uint32_t length = S->fftLen;

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_FFT_Q15_CONFIG;

    if (ifftFlag == 1U)
    {
        PQ_TransformIFFT(POWERQUAD_NS, length, pIn, pOut);
    }
    else
    {
        PQ_TransformCFFT(POWERQUAD_NS, length, pIn, pOut);
    }

    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

arm_status arm_rfft_init_q31(arm_rfft_instance_q31 *S, uint32_t fftLenReal, uint32_t ifftFlagR, uint32_t bitReverseFlag)
{
    /* Only supprt such mode. */
    assert(ifftFlagR == 0);
    assert(bitReverseFlag == 1);

    /*  Initialise the default arm status */
    arm_status status = ARM_MATH_SUCCESS;

    /*  Initialize the Real FFT length */
    S->fftLenReal = (uint16_t)fftLenReal;

    /*  Initialize the Flag for selection of RFFT or RIFFT */
    S->ifftFlagR = (uint8_t)ifftFlagR;

    /*  Initialize the Flag for calculation Bit reversal or not */
    S->bitReverseFlagR = (uint8_t)bitReverseFlag;

    /* return the status of RFFT Init function */
    return (status);
}

void arm_rfft_q31(const arm_rfft_instance_q31 *S, q31_t *pSrc, q31_t *pDst)
{
    uint32_t length = S->fftLenReal;
    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_FFT_Q31_CONFIG;

    /* Calculation of RFFT of input */
    PQ_TransformRFFT(POWERQUAD_NS, length, pSrc, pDst);

    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

arm_status arm_rfft_init_q15(arm_rfft_instance_q15 *S, uint32_t fftLenReal, uint32_t ifftFlagR, uint32_t bitReverseFlag)
{
    /* Only supprt such mode. */
    assert(ifftFlagR == 0);
    assert(bitReverseFlag == 1);

    /*  Initialise the default arm status */
    arm_status status = ARM_MATH_SUCCESS;

    /*  Initialize the Real FFT length */
    S->fftLenReal = (uint16_t)fftLenReal;

    /*  Initialize the Flag for selection of RFFT or RIFFT */
    S->ifftFlagR = (uint8_t)ifftFlagR;

    /*  Initialize the Flag for calculation Bit reversal or not */
    S->bitReverseFlagR = (uint8_t)bitReverseFlag;

    /* return the status of RFFT Init function */
    return (status);
}

void arm_rfft_q15(const arm_rfft_instance_q15 *S, q15_t *pSrc, q15_t *pDst)
{
    uint32_t length = S->fftLenReal;
    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_FFT_Q15_CONFIG;

    /* Calculation of RFFT of input */
    PQ_TransformRFFT(POWERQUAD_NS, length, pSrc, pDst);

    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

arm_status arm_dct4_init_q31(arm_dct4_instance_q31 *S,
                             arm_rfft_instance_q31 *S_RFFT,
                             arm_cfft_radix4_instance_q31 *S_CFFT,
                             uint16_t N,
                             uint16_t Nby2,
                             q31_t normalize)
{
    /*  Initialise the default arm status */
    arm_status status = ARM_MATH_SUCCESS;

    /* Initialize the DCT4 length */
    S->N = N;

    /* Initialize Real FFT Instance */
    S->pRfft = S_RFFT;

    switch (N)
    {
        /* Initialize the table modifier values */
        case 512U:
            S->pTwiddle = dct512_twiddle;

        case 256U:
            S->pTwiddle = dct256_twiddle;

        case 128U:
            S->pTwiddle = dct128_twiddle;

            break;
        case 64U:
            S->pTwiddle = dct64_twiddle;

            break;
        case 32U:
            S->pTwiddle = dct32_twiddle;

            break;
        case 16U:
            S->pTwiddle = dct16_twiddle;

            break;
        default:
            status = ARM_MATH_ARGUMENT_ERROR;
    }

    /* Initialize the RFFT/RIFFT Function */
    arm_rfft_init_q31(S->pRfft, S->N, 0, 1);

    /* return the status of DCT4 Init function */
    return (status);
}

void arm_dct4_q31(const arm_dct4_instance_q31 *S, q31_t *pState, q31_t *pInlineBuffer)
{
    /* Calculate DCT-II for N-point input */
    uint16_t i;                   /* Loop counter */
    q31_t *weights = S->pTwiddle; /* Pointer to the Weights table */
    q31_t *pIn = pInlineBuffer;   /* Temporary pointers for input buffer */
    q31_t *pOut = pState;         /* Temporary pointers for output buffer */
    q31_t *pS1, *pbuff;           /* Temporary pointers for input buffer and pState buffer */
    q31_t in;                     /* Temporary variable */
    int64_t acc0;
    int64_t acc1;
    int64_t acc2;

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;

    PQ_SET_FFT_Q31_CONFIG;

    PQ_TransformRDCT(POWERQUAD_NS, S->N, pIn, pOut);
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;

    for (int i = 0; i < S->N; i++)
    {
        acc0 = (int64_t)pOut[i * 2] * weights[i * 2];             /* real * real */
        acc1 = (int64_t)pOut[(i * 2) + 1] * weights[(i * 2) + 1]; /* imaginary * imaginary */
        acc2 = acc0 - acc1;
        pOut[i * 2] = (uint32_t)(acc2 * 2/ (1024 * 1024 * 16));
    }

    /* ----------- Post-processing ---------- */
    /* DCT-IV can be obtained from DCT-II by the equation,
     *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
     *       Hence, Y4(0) = Y2(0)/2  */
    /* Getting only real part from the output and Converting to DCT-IV */

    /* pbuff initialized to input buffer. */
    pbuff = pInlineBuffer;

    /* pS1 initialized to pState */
    pS1 = pState;

    /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2. Considering the DCT II normalize, here divided by sqrt(2).*/
    in = (q31_t)((float)*pS1++ / 1.41421356237f);
    /* input buffer acts as inplace, so output values are stored in the input itself. */
    *pbuff++ = in;

    /* pState pointer is incremented twice as the real values are located alternatively in the array */
    pS1++;

    /* Initializing the loop counter */
    i = (S->N - 1U);

    while (i > 0U)
    {
        /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
        /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
        in = *pS1++ - in;
        *pbuff++ = in;
        /* points to the next real value */
        pS1++;

        /* Decrement the loop counter */
        i--;
    }
}

arm_status arm_dct4_init_q15(arm_dct4_instance_q15 *S,
                             arm_rfft_instance_q15 *S_RFFT,
                             arm_cfft_radix4_instance_q15 *S_CFFT,
                             uint16_t N,
                             uint16_t Nby2,
                             q15_t normalize)
{
    /*  Initialise the default arm status */
    arm_status status = ARM_MATH_SUCCESS;

    /* Initialize the DCT4 length */
    S->N = N;

    /* Initialize Real FFT Instance */
    S->pRfft = S_RFFT;

    switch (N)
    {
        /* Initialize the table modifier values */
        case 512U:
            S->pTwiddle = (q15_t *)dct512_twiddle;

        case 256U:
            S->pTwiddle = (q15_t *)dct256_twiddle;

        case 128U:
            S->pTwiddle = (q15_t *)dct128_twiddle;

            break;
        case 64U:
            S->pTwiddle = (q15_t *)dct64_twiddle;

            break;
        case 32U:
            S->pTwiddle = (q15_t *)dct32_twiddle;

            break;
        case 16U:
            S->pTwiddle = (q15_t *)dct16_twiddle;

            break;
        default:
            status = ARM_MATH_ARGUMENT_ERROR;
    }

    /* Initialize the RFFT/RIFFT Function */
    arm_rfft_init_q15(S->pRfft, S->N, 0, 1);

    /* return the status of DCT4 Init function */
    return (status);
}

void arm_dct4_q15(const arm_dct4_instance_q15 *S, q15_t *pState, q15_t *pInlineBuffer)
{
    /* Calculate DCT-II for N-point input */
    uint16_t i;                            /* Loop counter */
    q31_t *weights = (q31_t *)S->pTwiddle; /* Pointer to the Weights table */
    q15_t *pIn = pInlineBuffer;            /* Temporary pointers for input buffer */
    q15_t *pOut = pState;                  /* Temporary pointers for output buffer */
    q15_t *pS1, *pbuff;                    /* Temporary pointers for input buffer and pState buffer */
    q15_t in;                              /* Temporary variable */
    int64_t acc0;
    int64_t acc1;
    int64_t acc2;

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;

    PQ_SET_FFT_Q15_CONFIG;

    PQ_TransformRDCT(POWERQUAD_NS, S->N, pIn, pOut);
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;

    for (int i = 0; i < S->N; i++)
    {
        acc0 = (int64_t)pOut[i * 2] * weights[i * 2];             /* real * real */
        acc1 = (int64_t)pOut[(i * 2) + 1] * weights[(i * 2) + 1]; /* imaginary * imaginary */
        acc2 = acc0 - acc1;
        pOut[i * 2] = (uint32_t)(acc2 * 2 / (1024 * 1024 * 16));
    }

    /* ----------- Post-processing ---------- */
    /* DCT-IV can be obtained from DCT-II by the equation,
     *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
     *       Hence, Y4(0) = Y2(0)/2  */
    /* Getting only real part from the output and Converting to DCT-IV */

    /* pbuff initialized to input buffer. */
    pbuff = pInlineBuffer;

    /* pS1 initialized to pState */
    pS1 = pState;

    /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2. Considering the DCT II normalize, here divided by sqrt(2).*/
    in = (q15_t)((float)*pS1++ / 1.41421356237f);
    /* input buffer acts as inplace, so output values are stored in the input itself. */
    *pbuff++ = in;

    /* pState pointer is incremented twice as the real values are located alternatively in the array */
    pS1++;

    /* Initializing the loop counter */
    i = (S->N - 1U);

    while (i > 0U)
    {
        /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
        /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
        in = *pS1++ - in;
        *pbuff++ = in;
        /* points to the next real value */
        pS1++;

        /* Decrement the loop counter */
        i--;
    }
}

void PQ_BiquadCascadeDf2Init(pq_biquad_cascade_df2_instance *S, uint8_t numStages, pq_biquad_state_t *pState)
{
    S->numStages = numStages;
    S->pState = pState;
}

void PQ_BiquadCascadeDf2F32(const pq_biquad_cascade_df2_instance *S,
                            float32_t *pSrc,
                            float32_t *pDst,
                            uint32_t blockSize)
{
    uint32_t stage = S->numStages;
    pq_prescale_t prescale;
    pq_biquad_state_t *states = S->pState;

    memcpy(pDst, pSrc, blockSize);

    prescale.inputPrescale = 0;
    prescale.outputPrescale = 0;
    prescale.outputSaturate = 0;

    PQ_SetCoprocessorScaler(POWERQUAD_NS, &prescale);

    if (stage % 2 != 0)
    {
        PQ_BiquadRestoreInternalState(POWERQUAD_NS, 0, states);

        PQ_VectorBiqaudDf2F32(pSrc, pDst, blockSize);

        PQ_BiquadBackUpInternalState(POWERQUAD_NS, 0, states);

        states++;
        stage--;
    }

    do
    {
        PQ_BiquadRestoreInternalState(POWERQUAD_NS, 1, states);
        states++;
        PQ_BiquadRestoreInternalState(POWERQUAD_NS, 0, states);

        PQ_VectorBiqaudCascadeDf2F32(pDst, pDst, blockSize);

        states--;
        PQ_BiquadBackUpInternalState(POWERQUAD_NS, 1, states);
        states++;
        PQ_BiquadBackUpInternalState(POWERQUAD_NS, 0, states);

        states++;
        stage -= 2U;

    } while (stage > 0U);
}

void PQ_BiquadCascadeDf2Q31(const pq_biquad_cascade_df2_instance *S, q31_t *pSrc, q31_t *pDst, uint32_t blockSize)
{
    uint32_t stage = S->numStages;
    pq_prescale_t prescale;
    pq_biquad_state_t *states = S->pState;

    memcpy(pDst, pSrc, blockSize);

    prescale.inputPrescale = 0;
    prescale.outputPrescale = 0;
    prescale.outputSaturate = 0;

    PQ_SetCoprocessorScaler(POWERQUAD_NS, &prescale);

    if (stage % 2 != 0)
    {
        PQ_BiquadRestoreInternalState(POWERQUAD_NS, 0, states);

        PQ_VectorBiqaudDf2Q31(pSrc, pDst, blockSize);

        PQ_BiquadBackUpInternalState(POWERQUAD_NS, 0, states);

        states++;
        stage--;
    }

    do
    {
        PQ_BiquadRestoreInternalState(POWERQUAD_NS, 0, states);
        states++;
        PQ_BiquadRestoreInternalState(POWERQUAD_NS, 1, states);

        PQ_VectorBiqaudCascadeDf2Q31(pDst, pDst, blockSize);

        states--;
        PQ_BiquadBackUpInternalState(POWERQUAD_NS, 0, states);
        states++;
        PQ_BiquadBackUpInternalState(POWERQUAD_NS, 1, states);

        states++;
        stage -= 2U;
    } while (stage > 0U);
}

void PQ_BiquadCascadeDf2Q15(const pq_biquad_cascade_df2_instance *S, q15_t *pSrc, q15_t *pDst, uint32_t blockSize)
{
    uint32_t stage = S->numStages;
    pq_prescale_t prescale;
    pq_biquad_state_t *states = S->pState;

    memcpy(pDst, pSrc, blockSize);

    prescale.inputPrescale = 0;
    prescale.outputPrescale = 0;
    prescale.outputSaturate = 0;

    PQ_SetCoprocessorScaler(POWERQUAD_NS, &prescale);

    if (stage % 2 != 0)
    {
        PQ_BiquadRestoreInternalState(POWERQUAD_NS, 0, states);

        PQ_VectorBiqaudDf2Q15(pSrc, pDst, blockSize);

        PQ_BiquadBackUpInternalState(POWERQUAD_NS, 0, states);

        states++;
        stage--;
    }

    do
    {
        PQ_BiquadRestoreInternalState(POWERQUAD_NS, 0, states);
        states++;
        PQ_BiquadRestoreInternalState(POWERQUAD_NS, 1, states);

        PQ_VectorBiqaudCascadeDf2Q15(pDst, pDst, blockSize);

        states--;
        PQ_BiquadBackUpInternalState(POWERQUAD_NS, 0, states);
        states++;
        PQ_BiquadBackUpInternalState(POWERQUAD_NS, 1, states);

        states++;
        stage -= 2U;
    } while (stage > 0U);
}

void arm_fir_init_f32(
    arm_fir_instance_f32 *S, uint16_t numTaps, float32_t *pCoeffs, float32_t *pState, uint32_t blockSize)
{
    uint32_t i;

    /*
     * CMSIS DSP API filter coefficients stored in time reversed order, but PQ
     * uses the positive order. PQ does not use pState, so pState pState[1:numTaps]
     * is used here to save the coefficients in positive order. At the same time,
     * pState[0] is used to indicate whether this is the incremetal calculation.
     * Because the length of pState is (numTaps + blockSize -1), to ensure enough space,
     * the blockSize should be larger than 1.
     */
    assert(blockSize > 1);

    S->numTaps = numTaps;
    S->pCoeffs = pCoeffs;
    S->pState = pState;

    for (i=0; i<numTaps; i++)
    {
        pState[numTaps-i] = pCoeffs[i];
    }

    pState[0] = 0.0f;
}

void arm_fir_init_q31(arm_fir_instance_q31 *S, uint16_t numTaps, q31_t *pCoeffs, q31_t *pState, uint32_t blockSize)
{
    uint32_t i;

    /*
     * CMSIS DSP API filter coefficients stored in time reversed order, but PQ
     * uses the positive order. PQ does not use pState, so pState pState[1:numTaps]
     * is used here to save the coefficients in positive order. At the same time,
     * pState[0] is used to indicate whether this is the incremetal calculation.
     * Because the length of pState is (numTaps + blockSize -1), to ensure enough space,
     * the blockSize should be larger than 1.
     */
    assert(blockSize > 1);

    S->numTaps = numTaps;
    S->pCoeffs = pCoeffs;
    S->pState = pState;

    for (i=0; i<numTaps; i++)
    {
        pState[numTaps-i] = pCoeffs[i];
    }

    pState[0] = 0;
}

arm_status arm_fir_init_q15(
    arm_fir_instance_q15 *S, uint16_t numTaps, q15_t *pCoeffs, q15_t *pState, uint32_t blockSize)
{
    uint32_t i;

    /*
     * CMSIS DSP API filter coefficients stored in time reversed order, but PQ
     * uses the positive order. PQ does not use pState, so pState pState[1:numTaps]
     * is used here to save the coefficients in positive order. At the same time,
     * pState[0] is used to indicate whether this is the incremetal calculation.
     * Because the length of pState is (numTaps + blockSize -1), to ensure enough space,
     * the blockSize should be larger than 1.
     */
    assert(blockSize > 1);

    S->numTaps = numTaps;
    S->pCoeffs = pCoeffs;
    S->pState = pState;

    for (i=0; i<numTaps; i++)
    {
        pState[numTaps-i] = pCoeffs[i];
    }

    pState[0] = 0;

    return ARM_MATH_SUCCESS;
}

void arm_fir_f32(const arm_fir_instance_f32 *S, float32_t *pSrc, float32_t *pDst, uint32_t blockSize)
{
    assert(S);
    assert(pSrc);
    assert(pDst);

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_F32_CONFIG;

    if (S->pState[0] == 0.0)
    {
        PQ_FIR(POWERQUAD_NS, pSrc, blockSize, &(S->pState[1]), S->numTaps, pDst, PQ_FIR_FIR);
    }
    else
    {
        PQ_FIRIncrementAlt(POWERQUAD_NS, pSrc, blockSize, pDst, PQ_FIR_FIR);
    }
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_fir_q31(const arm_fir_instance_q31 *S, q31_t *pSrc, q31_t *pDst, uint32_t blockSize)
{
    assert(S);
    assert(pSrc);
    assert(pDst);

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_Q31_CONFIG;

    if (S->pState[0] == 0)
    {
        PQ_FIR(POWERQUAD_NS, pSrc, blockSize, &(S->pState[1]), S->numTaps, pDst, PQ_FIR_FIR);
    }
    else
    {
        PQ_FIRIncrementAlt(POWERQUAD_NS, pSrc, blockSize, pDst, PQ_FIR_FIR);
    }
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_fir_q15(const arm_fir_instance_q15 *S, q15_t *pSrc, q15_t *pDst, uint32_t blockSize)
{
    assert(S);
    assert(pSrc);
    assert(pDst);

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_Q15_CONFIG;

    if (S->pState[0] == 0)
    {
        PQ_FIR(POWERQUAD_NS, pSrc, blockSize, &(S->pState[1]), S->numTaps, pDst, PQ_FIR_FIR);
    }
    else
    {
        PQ_FIRIncrementAlt(POWERQUAD_NS, pSrc, blockSize, pDst, PQ_FIR_FIR);
    }
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_conv_f32(float32_t *pSrcA, uint32_t srcALen, float32_t *pSrcB, uint32_t srcBLen, float32_t *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_F32_CONFIG;

    PQ_FIR(POWERQUAD_NS, pSrcA, srcALen, pSrcB, srcBLen, pDst, PQ_FIR_CONVOLUTION);
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_conv_q31(q31_t *pSrcA, uint32_t srcALen, q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_Q31_CONFIG;

    PQ_FIR(POWERQUAD_NS, pSrcA, srcALen, pSrcB, srcBLen, pDst, PQ_FIR_CONVOLUTION);
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_conv_q15(q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_Q15_CONFIG;

    PQ_FIR(POWERQUAD_NS, pSrcA, srcALen, pSrcB, srcBLen, pDst, PQ_FIR_CONVOLUTION);
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_correlate_f32(float32_t *pSrcA, uint32_t srcALen, float32_t *pSrcB, uint32_t srcBLen, float32_t *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_F32_CONFIG;

    PQ_FIR(POWERQUAD_NS, pSrcA, srcALen, pSrcB, srcBLen, pDst, PQ_FIR_CORRELATION);
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_correlate_q31(q31_t *pSrcA, uint32_t srcALen, q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_Q31_CONFIG;

    PQ_FIR(POWERQUAD_NS, pSrcA, srcALen, pSrcB, srcBLen, pDst, PQ_FIR_CORRELATION);
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_correlate_q15(q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    PQ_DECLARE_CONFIG;
    PQ_BACKUP_CONFIG;
    PQ_SET_Q15_CONFIG;

    PQ_FIR(POWERQUAD_NS, pSrcA, srcALen, pSrcB, srcBLen, pDst, PQ_FIR_CORRELATION);
    PQ_WaitDone(POWERQUAD_NS);

    PQ_RESTORE_CONFIG;
}

void arm_mat_init_f32(arm_matrix_instance_f32 *S, uint16_t nRows, uint16_t nColumns, float32_t *pData)
{
    /* Assign Number of Rows */
    S->numRows = nRows;

    /* Assign Number of Columns */
    S->numCols = nColumns;

    /* Assign Data pointer */
    S->pData = pData;
}

void arm_mat_init_q31(arm_matrix_instance_q31 *S, uint16_t nRows, uint16_t nColumns, q31_t *pData)
{
    /* Assign Number of Rows */
    S->numRows = nRows;

    /* Assign Number of Columns */
    S->numCols = nColumns;

    /* Assign Data pointer */
    S->pData = pData;
}

void arm_mat_init_q15(arm_matrix_instance_q15 *S, uint16_t nRows, uint16_t nColumns, q15_t *pData)
{
    /* Assign Number of Rows */
    S->numRows = nRows;

    /* Assign Number of Columns */
    S->numCols = nColumns;

    /* Assign Data pointer */
    S->pData = pData;
}

arm_status arm_mat_add_f32(const arm_matrix_instance_f32 *pSrcA,
                           const arm_matrix_instance_f32 *pSrcB,
                           arm_matrix_instance_f32 *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numRows != pSrcB->numRows) || (pSrcA->numCols != pSrcB->numCols) || (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_F32_CONFIG;

        length = (pSrcB->numCols << 16) | (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        PQ_MatrixAddition(POWERQUAD_NS, length, pSrcA->pData, pSrcB->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_add_q31(const arm_matrix_instance_q31 *pSrcA,
                           const arm_matrix_instance_q31 *pSrcB,
                           arm_matrix_instance_q31 *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numRows != pSrcB->numRows) || (pSrcA->numCols != pSrcB->numCols) || (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_FIX32_CONFIG;

        length = (pSrcB->numCols << 16) | (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        PQ_MatrixAddition(POWERQUAD_NS, length, pSrcA->pData, pSrcB->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_add_q15(const arm_matrix_instance_q15 *pSrcA,
                           const arm_matrix_instance_q15 *pSrcB,
                           arm_matrix_instance_q15 *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numRows != pSrcB->numRows) || (pSrcA->numCols != pSrcB->numCols) || (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_FIX16_CONFIG;

        length = (pSrcB->numCols << 16) | (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        PQ_MatrixAddition(POWERQUAD_NS, length, pSrcA->pData, pSrcB->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_sub_f32(const arm_matrix_instance_f32 *pSrcA,
                           const arm_matrix_instance_f32 *pSrcB,
                           arm_matrix_instance_f32 *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numRows != pSrcB->numRows) || (pSrcA->numCols != pSrcB->numCols) || (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_F32_CONFIG;

        length = (pSrcB->numCols << 16) | (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        PQ_MatrixSubtraction(POWERQUAD_NS, length, pSrcA->pData, pSrcB->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_sub_q31(const arm_matrix_instance_q31 *pSrcA,
                           const arm_matrix_instance_q31 *pSrcB,
                           arm_matrix_instance_q31 *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numRows != pSrcB->numRows) || (pSrcA->numCols != pSrcB->numCols) || (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_FIX32_CONFIG;

        length = (pSrcB->numCols << 16) | (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        PQ_MatrixSubtraction(POWERQUAD_NS, length, pSrcA->pData, pSrcB->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_sub_q15(const arm_matrix_instance_q15 *pSrcA,
                           const arm_matrix_instance_q15 *pSrcB,
                           arm_matrix_instance_q15 *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numRows != pSrcB->numRows) || (pSrcA->numCols != pSrcB->numCols) || (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_FIX16_CONFIG;

        length = (pSrcB->numCols << 16) | (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        PQ_MatrixSubtraction(POWERQUAD_NS, length, pSrcA->pData, pSrcB->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_mult_f32(const arm_matrix_instance_f32 *pSrcA,
                            const arm_matrix_instance_f32 *pSrcB,
                            arm_matrix_instance_f32 *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numRows != pSrcB->numRows) || (pSrcA->numCols != pSrcB->numCols) || (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_F32_CONFIG;

        length = (pSrcB->numCols << 16) | (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        PQ_MatrixMultiplication(POWERQUAD_NS, length, pSrcA->pData, pSrcB->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_mult_q31(const arm_matrix_instance_q31 *pSrcA,
                            const arm_matrix_instance_q31 *pSrcB,
                            arm_matrix_instance_q31 *pDst)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

    /*
     * The output prescale does not supprt negative value due to hardware issue,
     * workaround:
     * 1. Downscale the matrix A and save the float output value to private memory.
     * 2. Multiply the float matrix A in private memory with matrix B, output as Q31.
     */
    pq_config_t config =
    {
        .inputAFormat = kPQ_32Bit,
        .inputAPrescale = 0,
        .inputBFormat = kPQ_32Bit,
        .inputBPrescale = 0,
        .outputFormat = kPQ_Float,
        .outputPrescale = 0,
        .tmpFormat = kPQ_Float,
        .tmpPrescale = 0,
        .machineFormat = kPQ_Float,
        .tmpBase = (void*)0xe0000000,
    };

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numRows != pSrcB->numRows) || (pSrcA->numCols != pSrcB->numCols) || (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;

        length = (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        PQ_SetConfig(POWERQUAD_NS, &config);

        /* Downscale. */
        PQ_MatrixScale(POWERQUAD_NS, length, 1.0f/2147483648.0f, pSrcA->pData, (void*)0xE0000000);

        length = (pSrcB->numCols << 16) | (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        config.inputAFormat = kPQ_Float;
        config.outputFormat = kPQ_32Bit;

        PQ_WaitDone(POWERQUAD_NS);

        PQ_SetConfig(POWERQUAD_NS, &config);

        PQ_MatrixMultiplication(POWERQUAD_NS, length, (void*)0xE0000000, pSrcB->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_mult_q15(const arm_matrix_instance_q15 *pSrcA,
                            const arm_matrix_instance_q15 *pSrcB,
                            arm_matrix_instance_q15 *pDst,
                            q15_t *pState)
{
    assert(pSrcA);
    assert(pSrcB);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

    pq_config_t config =
    {
        .inputAFormat = kPQ_16Bit,
        .inputAPrescale = 0,
        .inputBFormat = kPQ_16Bit,
        .inputBPrescale = 0,
        .outputFormat = kPQ_Float,
        .outputPrescale = 0,
        .tmpFormat = kPQ_Float,
        .tmpPrescale = 0,
        .machineFormat = kPQ_Float,
        .tmpBase = (void*)0xe0000000,
    };

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numRows != pSrcB->numRows) || (pSrcA->numCols != pSrcB->numCols) || (pSrcA->numRows != pDst->numRows) ||
        (pSrcA->numCols != pDst->numCols))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;

        length = (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        PQ_SetConfig(POWERQUAD_NS, &config);

        /* Downscale. */
        PQ_MatrixScale(POWERQUAD_NS, length, 1.0f/32768.0f, pSrcA->pData, (void*)0xE0000000);

        length = (pSrcB->numCols << 16) | (pSrcA->numCols << 8) | (pSrcA->numRows << 0);

        config.inputAFormat = kPQ_Float;
        config.outputFormat = kPQ_16Bit;

        PQ_WaitDone(POWERQUAD_NS);

        PQ_SetConfig(POWERQUAD_NS, &config);

        PQ_MatrixMultiplication(POWERQUAD_NS, length, (void*)0xE0000000, pSrcB->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_inverse_f32(const arm_matrix_instance_f32 *pSrc, arm_matrix_instance_f32 *pDst)
{
    assert(pSrc);
    assert(pDst);

    arm_status status;
    q31_t length;
    float tmp[1024];
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols) || (pSrc->numRows != pDst->numRows))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_F32_CONFIG;

        length = (pSrc->numRows << 16) | (pSrc->numRows << 8) | (pSrc->numRows << 0);

        PQ_MatrixInversion(POWERQUAD_NS, length, pSrc->pData, tmp, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_trans_f32(const arm_matrix_instance_f32 *pSrc, arm_matrix_instance_f32 *pDst)
{
    assert(pSrc);
    assert(pDst);

    arm_status status;
    q31_t length;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols) || (pSrc->numRows != pDst->numRows))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        length = (pSrc->numCols << 8) | (pSrc->numRows << 0);

        PQ_SetFormat(POWERQUAD_NS, kPQ_CP_MTX, kPQ_Float);

        PQ_MatrixTranspose(POWERQUAD_NS, length, pSrc->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);
        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_trans_q31(const arm_matrix_instance_q31 *pSrc, arm_matrix_instance_q31 *pDst)
{
    assert(pSrc);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols) || (pSrc->numRows != pDst->numRows))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_FIX32_CONFIG;

        length = (pSrc->numCols << 8) | (pSrc->numRows << 0);

        PQ_MatrixTranspose(POWERQUAD_NS, length, pSrc->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_trans_q15(const arm_matrix_instance_q15 *pSrc, arm_matrix_instance_q15 *pDst)
{
    assert(pSrc);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols) || (pSrc->numRows != pDst->numRows))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_FIX16_CONFIG;

        length = (pSrc->numCols << 8) | (pSrc->numRows << 0);

        PQ_MatrixTranspose(POWERQUAD_NS, length, pSrc->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_scale_f32(const arm_matrix_instance_f32 *pSrc, float32_t scale, arm_matrix_instance_f32 *pDst)
{
    assert(pSrc);
    assert(pDst);

    arm_status status;
    q31_t length;
    PQ_DECLARE_CONFIG;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols) || (pSrc->numRows != pDst->numRows))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;
        PQ_SET_F32_CONFIG;

        length = (pSrc->numCols << 8) | (pSrc->numRows << 0);

        PQ_MatrixScale(POWERQUAD_NS, length, scale, pSrc->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_scale_q31(const arm_matrix_instance_q31 *pSrc,
                             q31_t scale,
                             int32_t shift,
                             arm_matrix_instance_q31 *pDst)
{
    assert(pSrc);
    assert(pDst);

    arm_status status;
    q31_t length;
    float scaleFloat;
    PQ_DECLARE_CONFIG;

    const pq_config_t config =
    {
        .inputAFormat = kPQ_32Bit,
        .inputAPrescale = 0,
        .inputBFormat = kPQ_32Bit,
        .inputBPrescale = 0,
        .outputFormat = kPQ_32Bit,
        .outputPrescale = shift,
        .tmpFormat = kPQ_Float,
        .tmpPrescale = 0,
        .machineFormat = kPQ_Float,
        .tmpBase = (void*)0xe0000000,
    };

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols) || (pSrc->numRows != pDst->numRows))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;

        length = (pSrc->numCols << 8) | (pSrc->numRows << 0);

        scaleFloat = PQ_Q31_2_FLOAT(scale);

        PQ_SetConfig(POWERQUAD_NS, &config);

        PQ_MatrixScale(POWERQUAD_NS, length, scaleFloat, pSrc->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}

arm_status arm_mat_scale_q15(const arm_matrix_instance_q15 *pSrc,
                             q15_t scale,
                             int32_t shift,
                             arm_matrix_instance_q15 *pDst)
{
    assert(pSrc);
    assert(pDst);

    arm_status status;
    q31_t length;
    float scaleFloat;
    PQ_DECLARE_CONFIG;

    const pq_config_t config =
    {
        .inputAFormat = kPQ_16Bit,
        .inputAPrescale = 0,
        .inputBFormat = kPQ_16Bit,
        .inputBPrescale = 0,
        .outputFormat = kPQ_16Bit,
        .outputPrescale = shift,
        .tmpFormat = kPQ_Float,
        .tmpPrescale = 0,
        .machineFormat = kPQ_Float,
        .tmpBase = (void*)0xe0000000,
    };

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols) || (pSrc->numRows != pDst->numRows))
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif
    {
        PQ_BACKUP_CONFIG;

        length = (pSrc->numCols << 8) | (pSrc->numRows << 0);

        scaleFloat = PQ_Q15_2_FLOAT(scale);

        PQ_SetConfig(POWERQUAD_NS, &config);

        PQ_MatrixScale(POWERQUAD_NS, length, scaleFloat, pSrc->pData, pDst->pData);

        /* Wait for the completion */
        PQ_WaitDone(POWERQUAD_NS);

        PQ_RESTORE_CONFIG;

        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return status;
}
