/*
 * Copyright 2018 NXP
 * All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#ifndef _FSL_POWERQUAD_H_
#define _FSL_POWERQUAD_H_
/*
 * Driver-local configuration macros. Nothing in the visible part of this
 * header tests them.
 * NOTE(review): identifiers beginning with a double underscore are reserved
 * for the implementation in C; the names are kept as-is because other files
 * may test them - confirm before renaming.
 */
#define __pq_TARGET_COPROC_V8
#define __pq_ACLE 200

#if defined(__CC_ARM)

#elif defined(__ICCARM__)
#include <intrinsics.h>
#elif defined(__GNUC__)

#endif /* defined(__CC_ARM) */

#include "fsl_common.h"
#include "arm_math.h"
#include "fsl_twiddlefactor.h"

/*!
 * @addtogroup powerquad
 * @{
 */


/*
 * PowerQuad helper union: one storage location that can be accessed either as
 * a float or as an unsigned 32-bit word.
 * NOTE(review): the original comment text was mis-encoded; `for_cp` is
 * presumably the raw bit pattern handed to the coprocessor - confirm.
 */
typedef union
{
    float    x; /* Value accessed as a single-precision float. */
    uint32_t for_cp; /* The same storage accessed as an unsigned 32-bit word. */
} PQFLT;


/*******************************************************************************
 * Definitions
 ******************************************************************************/

/*! @name Driver version */
/*@{*/
#define FSL_POWERQUAD_DRIVER_VERSION (MAKE_VERSION(2, 0, 0)) /*!< Version 2.0.0. */
/*@}*/

/* Number-format selectors; the _pq_* wrappers OR one of these with a PQ_COMPx_ONLY value. */
#define PQ_FLOAT32      0U
#define PQ_FIXEDPT      1U

/* Coprocessor selectors; CP_PQ is the first argument of the _pq_* intrinsic wrappers. */
#define CP_PQ           0U
#define CP_MTX          1U
#define CP_FFT          2U
#define CP_FIR          3U
#define CP_CORDIC       5U

/* Function-group selectors passed as the last argument of the floating-point wrappers. */
#define PQ_TRANS        0U
#define PQ_TRIG         1U
#define PQ_BIQUAD       2U

/* Function-group selectors passed as the last argument of the fixed-point (_fx) wrappers. */
#define PQ_TRANS_FIXED  4U
#define PQ_TRIG_FIXED   5U
#define PQ_BIQUAD_FIXED 6U

/* Operation codes used together with PQ_TRANS / PQ_TRANS_FIXED (PQ_DIV is used by _pq_div0/_pq_div1). */
#define PQ_INV          0U
#define PQ_LN           1U
#define PQ_SQRT         2U
#define PQ_INVSQRT      3U
#define PQ_ETOX         4U
#define PQ_ETONX        5U
#define PQ_DIV          6U

/* Operation codes used together with PQ_TRIG / PQ_TRIG_FIXED. */
#define PQ_SIN          0U
#define PQ_COS          1U

/* Operation codes used together with PQ_BIQUAD / PQ_BIQUAD_FIXED; both have the same value. */
#define PQ_BIQ0_CALC    1U
#define PQ_BIQ1_CALC    1U

/* Compute-unit selector held in bit 1 of the value OR-ed with the number format. */
#define PQ_COMP0_ONLY   (0U << 1)
#define PQ_COMP1_ONLY   (1U << 1)

/*
 * CORDIC control-word field builders: CORDIC_T places its argument at bit 0,
 * CORDIC_MIU at bit 1 and CORDIC_ITER at bit 2.
 * Arguments and whole expansions are parenthesized so the macros can be
 * combined with other operators (e.g. `CORDIC_ARCTANH & mask`, or an argument
 * such as `a | b`) without operator-precedence surprises.
 */
#define CORDIC_ITER(x) ((x) << 2)
#define CORDIC_MIU(x) ((x) << 1)
#define CORDIC_T(x) ((x) << 0)
/* Predefined T/MIU combinations: T = 1 with MIU = 0 or MIU = 1. */
#define CORDIC_ARCTAN (CORDIC_T(1) | CORDIC_MIU(0))
#define CORDIC_ARCTANH (CORDIC_T(1) | CORDIC_MIU(1))

/* Bit 31 mask named INST_BUSY. */
#define INST_BUSY                0x80000000U

/* Error-status identifiers (PQ_ERRSTAT_*). */
#define PQ_ERRSTAT_OVERFLOW      0U
#define PQ_ERRSTAT_NAN           1U
#define PQ_ERRSTAT_FIXEDOVERFLOW 2U
#define PQ_ERRSTAT_UNDERFLOW     3U

/* Transform type codes (PQ_TRANS_*); value 5 is not assigned here. */
#define PQ_TRANS_CFFT            0U
#define PQ_TRANS_IFFT            1U
#define PQ_TRANS_CDCT            2U
#define PQ_TRANS_IDCT            3U
#define PQ_TRANS_RFFT            4U
#define PQ_TRANS_RDCT            6U

/* Matrix / vector operation codes; values 6 and 8 are not assigned here. */
#define PQ_MTX_SCALE             1U
#define PQ_MTX_MULT              2U
#define PQ_MTX_ADD               3U
#define PQ_MTX_INV               4U
#define PQ_MTX_PROD              5U
#define PQ_MTX_SUB               7U
#define PQ_VEC_DOTP              9U
#define PQ_MTX_TRAN              10U

/* FIR engine operation type */
#define PQ_FIR_FIR               0U
#define PQ_FIR_CONVOLUTION       1U
#define PQ_FIR_CORRELATION       2U
#define PQ_FIR_INCREMENTAL       4U

/*
 * Single-operand write wrappers that select PQ_COMP0_ONLY, floating-point format.
 * Each expands to
 *     __arm_mcr(CP_PQ, <operation>, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, <group>)
 * with <group> being PQ_TRANS, PQ_TRIG or PQ_BIQUAD.
 * NOTE(review): the result is presumably fetched afterwards with one of the
 * _pq_read*0 macros - confirm against the callers.
 */
#define _pq_ln0(x) __arm_mcr(CP_PQ, PQ_LN, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, PQ_TRANS)
#define _pq_inv0(x) __arm_mcr(CP_PQ, PQ_INV, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, PQ_TRANS)
#define _pq_sqrt0(x) __arm_mcr(CP_PQ, PQ_SQRT, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, PQ_TRANS)
#define _pq_invsqrt0(x) __arm_mcr(CP_PQ, PQ_INVSQRT, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, PQ_TRANS)
#define _pq_etox0(x) __arm_mcr(CP_PQ, PQ_ETOX, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, PQ_TRANS)
#define _pq_etonx0(x) __arm_mcr(CP_PQ, PQ_ETONX, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, PQ_TRANS)
#define _pq_sin0(x) __arm_mcr(CP_PQ, PQ_SIN, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, PQ_TRIG)
#define _pq_cos0(x) __arm_mcr(CP_PQ, PQ_COS, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, PQ_TRIG)
#define _pq_biquad0(x) __arm_mcr(CP_PQ, PQ_BIQ0_CALC, x, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, PQ_BIQUAD)

/*
 * Fixed-point counterparts of the wrappers above: PQ_FIXEDPT replaces
 * PQ_FLOAT32 and the group becomes PQ_TRANS_FIXED, PQ_TRIG_FIXED or
 * PQ_BIQUAD_FIXED. Note the naming irregularity: the biquad variant is
 * _pq_biquad0_fx, whereas the others end in _fx0.
 */
#define _pq_ln_fx0(x) __arm_mcr(CP_PQ, PQ_LN, x, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_inv_fx0(x) __arm_mcr(CP_PQ, PQ_INV, x, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_sqrt_fx0(x) __arm_mcr(CP_PQ, PQ_SQRT, x, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_invsqrt_fx0(x) __arm_mcr(CP_PQ, PQ_INVSQRT, x, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_etox_fx0(x) __arm_mcr(CP_PQ, PQ_ETOX, x, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_etonx_fx0(x) __arm_mcr(CP_PQ, PQ_ETONX, x, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_sin_fx0(x) __arm_mcr(CP_PQ, PQ_SIN, x, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, PQ_TRIG_FIXED)
#define _pq_cos_fx0(x) __arm_mcr(CP_PQ, PQ_COS, x, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, PQ_TRIG_FIXED)
#define _pq_biquad0_fx(x) __arm_mcr(CP_PQ, PQ_BIQ0_CALC, x, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, PQ_BIQUAD_FIXED)

/*
 * Division wrappers. Unlike the single-operand wrappers these use __arm_mcrr,
 * passing x as the value argument and PQ_DIV as the last argument; the suffix
 * 0/1 selects PQ_COMP0_ONLY or PQ_COMP1_ONLY.
 * NOTE(review): x is presumably a 64-bit value packing both operands - confirm
 * the packing order against the callers.
 */
#define _pq_div0(x) __arm_mcrr(CP_PQ, PQ_FLOAT32 | PQ_COMP0_ONLY, x, PQ_DIV)
#define _pq_div1(x) __arm_mcrr(CP_PQ, PQ_FLOAT32 | PQ_COMP1_ONLY, x, PQ_DIV)

/*
 * Single-operand write wrappers that select PQ_COMP1_ONLY, floating-point format.
 * Each expands to
 *     __arm_mcr(CP_PQ, <operation>, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, <group>)
 * with <group> being PQ_TRANS, PQ_TRIG or PQ_BIQUAD.
 */
#define _pq_ln1(x) __arm_mcr(CP_PQ, PQ_LN, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, PQ_TRANS)
#define _pq_inv1(x) __arm_mcr(CP_PQ, PQ_INV, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, PQ_TRANS)
#define _pq_sqrt1(x) __arm_mcr(CP_PQ, PQ_SQRT, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, PQ_TRANS)
#define _pq_invsqrt1(x) __arm_mcr(CP_PQ, PQ_INVSQRT, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, PQ_TRANS)
#define _pq_etox1(x) __arm_mcr(CP_PQ, PQ_ETOX, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, PQ_TRANS)
#define _pq_etonx1(x) __arm_mcr(CP_PQ, PQ_ETONX, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, PQ_TRANS)
#define _pq_sin1(x) __arm_mcr(CP_PQ, PQ_SIN, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, PQ_TRIG)
#define _pq_cos1(x) __arm_mcr(CP_PQ, PQ_COS, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, PQ_TRIG)
#define _pq_biquad1(x) __arm_mcr(CP_PQ, PQ_BIQ1_CALC, x, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, PQ_BIQUAD)

/*
 * Fixed-point counterparts for PQ_COMP1_ONLY: PQ_FIXEDPT replaces PQ_FLOAT32
 * and the group becomes PQ_TRANS_FIXED, PQ_TRIG_FIXED or PQ_BIQUAD_FIXED.
 * Note the naming irregularity: the biquad variant is _pq_biquad1_fx, whereas
 * the others end in _fx1.
 */
#define _pq_ln_fx1(x) __arm_mcr(CP_PQ, PQ_LN, x, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_inv_fx1(x) __arm_mcr(CP_PQ, PQ_INV, x, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_sqrt_fx1(x) __arm_mcr(CP_PQ, PQ_SQRT, x, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_invsqrt_fx1(x) __arm_mcr(CP_PQ, PQ_INVSQRT, x, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_etox_fx1(x) __arm_mcr(CP_PQ, PQ_ETOX, x, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_etonx_fx1(x) __arm_mcr(CP_PQ, PQ_ETONX, x, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, PQ_TRANS_FIXED)
#define _pq_sin_fx1(x) __arm_mcr(CP_PQ, PQ_SIN, x, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, PQ_TRIG_FIXED)
#define _pq_cos_fx1(x) __arm_mcr(CP_PQ, PQ_COS, x, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, PQ_TRIG_FIXED)
#define _pq_biquad1_fx(x) __arm_mcr(CP_PQ, PQ_BIQ1_CALC, x, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, PQ_BIQUAD_FIXED)

/*
 * Result read-back wrappers built on __arm_mrc(CP_PQ, <sel>, <format | unit>, 0, 0).
 * <sel> is 0 for the readMult* macros and 1 for the readAdd* macros; the
 * trailing 0/1 in the name selects PQ_COMP0_ONLY or PQ_COMP1_ONLY, and the _fx
 * suffix selects PQ_FIXEDPT instead of PQ_FLOAT32.
 * NOTE(review): which hardware result register "Mult" and "Add" refer to is
 * not visible here - confirm against the PowerQuad reference manual.
 */
#define _pq_readMult0() __arm_mrc(CP_PQ, 0, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, 0)
#define _pq_readAdd0() __arm_mrc(CP_PQ, 1, PQ_FLOAT32 | PQ_COMP0_ONLY, 0, 0)
#define _pq_readMult1() __arm_mrc(CP_PQ, 0, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, 0)
#define _pq_readAdd1() __arm_mrc(CP_PQ, 1, PQ_FLOAT32 | PQ_COMP1_ONLY, 0, 0)
#define _pq_readMult0_fx() __arm_mrc(CP_PQ, 0, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, 0)
#define _pq_readAdd0_fx() __arm_mrc(CP_PQ, 1, PQ_FIXEDPT | PQ_COMP0_ONLY, 0, 0)
#define _pq_readMult1_fx() __arm_mrc(CP_PQ, 0, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, 0)
#define _pq_readAdd1_fx() __arm_mrc(CP_PQ, 1, PQ_FIXEDPT | PQ_COMP1_ONLY, 0, 0)

/*
 * Prologue for the 32-bit vector loops.
 *
 * Register contract established here and used by the _pq_vector8_* macros:
 *   r0: pSrc, r1: pDest, r2: length, r3: middle, r4-r9: data, r10: dra
 *
 * The sequence copies PSRC/PDST/LENGTH into r0/r1/r2, saves r3-r10 on the
 * stack (to be restored by _pq_end_vector_func()), clears r3 and r10, and
 * preloads the first two 32-bit source words into r4/r5 while advancing r0 by
 * 8 bytes.
 *
 * r0-r2 are listed as clobbers so the compiler never places an input operand
 * in a register that one of the earlier MOVs has already overwritten (for
 * example PDST living in r0). "memory" is listed because the sequence pushes
 * to the stack and reads from *PSRC.
 */
#define _pq_initiate_vector_func(PSRC, PDST, LENGTH)               \
    __asm volatile(                                                \
        "MOV r0, %[psrc]         \n"                               \
        "MOV r1, %[pdst]         \n"                               \
        "MOV r2, %[length]       \n"                               \
        "PUSH {r3-r10}           \n"                               \
        "MOV r3, #0              \n"                               \
        "MOV r10, #0             \n"                               \
        "LDRD r4,r5,[r0],#8      \n"                               \
        :                                                          \
        : [psrc] "r"(PSRC), [pdst] "r"(PDST), [length] "r"(LENGTH) \
        : "r0", "r1", "r2", "memory")

/*
 * Prologue for the 16-bit fixed-point vector loops.
 *
 * Copies PSRC/PDST/LENGTH into r0/r1/r2, saves r3-r10 on the stack (to be
 * restored by _pq_end_vector_func()), clears r3 (the "middle" flag) and
 * preloads the first two source halfwords, sign-extended, into r4 and r5 while
 * advancing r0 by 2 bytes per load.
 *
 * r0-r2 are listed as clobbers so the compiler never places an input operand
 * in a register that one of the earlier MOVs has already overwritten.
 * "memory" is listed because the sequence pushes to the stack and reads from
 * *PSRC.
 */
#define _pq_initiate_vector_func_fx16(PSRC, PDST, LENGTH)          \
    __asm volatile(                                                \
        "MOV r0, %[psrc]          \n"                              \
        "MOV r1, %[pdst]          \n"                              \
        "MOV r2, %[length]        \n"                              \
        "PUSH {r3-r10}            \n"                              \
        "MOV r3, #0               \n"                              \
        "LDRSH r4,[r0],#2         \n"                              \
        "LDRSH r5,[r0],#2         \n"                              \
        :                                                          \
        : [psrc] "r"(PSRC), [pdst] "r"(PDST), [length] "r"(LENGTH) \
        : "r0", "r1", "r2", "memory")

/*
 * Prologue for the Q15 vector loop.
 *
 * Copies PSRC/PDST/LENGTH into r0/r1/r2, saves r3-r10 on the stack (to be
 * restored by _pq_end_vector_func()), clears r3 (the "middle" flag) and loads
 * one 32-bit word holding two 16-bit samples: r4 receives the low halfword
 * shifted into the upper 16 bits, r5 keeps the high halfword with its lower
 * 16 bits cleared.
 *
 * r0-r2 are listed as clobbers so the compiler never places an input operand
 * in a register that one of the earlier MOVs has already overwritten.
 * "memory" is listed because the sequence pushes to the stack and reads from
 * *PSRC.
 */
#define _pq_initiate_vector_func_q15(PSRC, PDST, LENGTH)           \
    __asm volatile(                                                \
        "MOV r0, %[psrc]          \n"                              \
        "MOV r1, %[pdst]          \n"                              \
        "MOV r2, %[length]        \n"                              \
        "PUSH {r3-r10}            \n"                              \
        "MOV r3, #0               \n"                              \
        "LDR r5,[r0],#4           \n"                              \
        "LSL r4,r5,#16            \n"                              \
        "BFC r5,#0,#16            \n"                              \
        :                                                          \
        : [psrc] "r"(PSRC), [pdst] "r"(PDST), [length] "r"(LENGTH) \
        : "r0", "r1", "r2", "memory")

/*
 * Epilogue for the vector loops: pops r3-r10, i.e. the registers pushed by
 * the _pq_initiate_vector_func* macros. It must be paired with exactly one of
 * those macros, otherwise the stack pointer is left unbalanced.
 */
#define _pq_end_vector_func() __asm volatile("POP {r3-r10}            \n")

/*
 * Main loop of the floating-point vector functions; 8 elements per pass.
 *
 * Expects the register contract prepared by _pq_initiate_vector_func():
 * r0 = source pointer, r1 = destination pointer, r2 = remaining length,
 * r3 = "middle" flag (0 on the first pass), r4/r5 = first two inputs already
 * loaded. r6-r9 carry further data and r10 receives a copy of
 * DOUBLE_READ_ADDERS.
 *
 * BATCH_OPCODE and BATCH_MACHINE are emitted as the opc1 and opc2 immediates
 * of every MCR. DOUBLE_READ_ADDERS selects the MRRC source register: non-zero
 * reads c1, zero reads c0. All three must be compile-time constants ("i").
 *
 * The loop leaves only when r2 is exactly zero after "SUBS r2, r2, #8", so
 * the length must be a non-zero multiple of 8.
 *
 * The operand list needs a comma between the [dra] and [machine] entries;
 * without it the macro does not compile when expanded. The clobber list tells
 * the compiler that r0-r2, the condition flags and memory are modified.
 * r3-r10 are not listed because they are saved by the PUSH in the initiate
 * macro and restored by the POP in _pq_end_vector_func().
 *
 * NOTE(review): "loop" is an ordinary assembler label; expanding more than
 * one of the _pq_vector8_* macros in one translation unit may give a
 * duplicate-symbol error with GNU-style assemblers - confirm for the
 * toolchains in use.
 */
#define _pq_vector8_fp(BATCH_OPCODE, DOUBLE_READ_ADDERS, BATCH_MACHINE)                    \
    __asm volatile(                                                                        \
        "loop:                                     \n"                                     \
        "    MCR  p0,%[opcode],r5,c2,c0,%[machine] \n"                                     \
        "    MCR  p0,%[opcode],r4,c0,c0,%[machine] \n"                                     \
        "    CMP  r3, #0                           \n"                                     \
        "    ITE  NE                               \n"                                     \
        "    STRDNE r6,r7,[r1],#8                  \n" /* store fourth two results */      \
        "    MOVEQ r3, #1                          \n" /* middle = 1 */                    \
        "    LDMIA  r0!,{r6-r9}                    \n" /* load next 4 datas */             \
        "    MOV  r10,%[dra]                       \n"                                     \
        "    CMP  r10, #0                          \n"                                     \
        "    ITE  NE                               \n"                                     \
        "    MRRCNE  p0,#0,r4,r5,c1                \n"                                     \
        "    MRRCEQ  p0,#0,r4,r5,c0                \n"                                     \
        "    MCR  p0,%[opcode],r7,c2,c0,%[machine] \n"                                     \
        "    MCR  p0,%[opcode],r6,c0,c0,%[machine] \n"                                     \
        "    STRD r4,r5,[r1],#8                    \n" /* store first two results */       \
        "    MOV  r10,%[dra]                       \n"                                     \
        "    CMP  r10, #0                          \n"                                     \
        "    ITE  NE                               \n"                                     \
        "    MRRCNE  p0,#0,r6,r7,c1                \n"                                     \
        "    MRRCEQ  p0,#0,r6,r7,c0                \n"                                     \
        "    MCR  p0,%[opcode],r9,c2,c0,%[machine] \n"                                     \
        "    MCR  p0,%[opcode],r8,c0,c0,%[machine] \n"                                     \
        "    STRD r6,r7,[r1],#8                    \n" /* store second two results */      \
        "    LDRD r6,r7,[r0],#8                    \n" /* load last 2 of the 8 */          \
        "    CMP  r10, #0                          \n"                                     \
        "    ITE  NE                               \n"                                     \
        "    MRRCNE  p0,#0,r8,r9,c1                \n"                                     \
        "    MRRCEQ  p0,#0,r8,r9,c0                \n"                                     \
        "    MCR  p0,%[opcode],r7,c2,c0,%[machine] \n"                                     \
        "    MCR  p0,%[opcode],r6,c0,c0,%[machine] \n"                                     \
        "    STRD r8,r9,[r1],#8                    \n" /* store third two results */       \
        "    SUBS r2, r2, #8                       \n" /* length -= 8; if (length != 0) */ \
        "    IT   NE                               \n"                                     \
        "    LDRDNE r4,r5,[r0],#8                  \n" /* load first two of next 8 */      \
        "    CMP  r10, #0                          \n"                                     \
        "    ITE  NE                               \n"                                     \
        "    MRRCNE  p0,#0,r6,r7,c1                \n"                                     \
        "    MRRCEQ  p0,#0,r6,r7,c0                \n"                                     \
        "    CMP  r2, #0                           \n" /* repeat while length != 0 */      \
        "    BNE  loop                             \n"                                     \
        "    STRD r6,r7,[r1],#8                    \n" /* store fourth two results */      \
        :                                                                                  \
        : [opcode] "i"(BATCH_OPCODE), [dra] "i"(DOUBLE_READ_ADDERS),                       \
          [machine] "i"(BATCH_MACHINE)                                                     \
        : "r0", "r1", "r2", "cc", "memory")

/*
 * Main loop of the 32-bit fixed-point vector functions; 8 elements per pass.
 *
 * Expects the register contract prepared by _pq_initiate_vector_func():
 * r0 = source pointer, r1 = destination pointer, r2 = remaining length,
 * r3 = "middle" flag (0 on the first pass), r4/r5 = first two inputs already
 * loaded; r6-r9 carry further data.
 *
 * Inputs are written in pairs with MCR to c1 and c3 (an ISB separates the two
 * writes) and results are read back with MRC from c1 and c3. BATCH_OPCODE and
 * BATCH_MACHINE are the opc1/opc2 immediates of the MCRs; DOUBLE_READ_ADDERS
 * is the opc1 immediate of the MRCs. All must be compile-time constants.
 *
 * The loop leaves only when r2 is exactly zero after "SUBS r2, r2, #8", so
 * the length must be a non-zero multiple of 8.
 *
 * The clobber list tells the compiler that r0-r2, the condition flags and
 * memory are modified. r3-r10 are not listed because they are saved by the
 * PUSH in the initiate macro and restored by the POP in
 * _pq_end_vector_func().
 *
 * NOTE(review): "loop" is an ordinary assembler label; expanding more than
 * one of the _pq_vector8_* macros in one translation unit may give a
 * duplicate-symbol error with GNU-style assemblers - confirm for the
 * toolchains in use.
 */
#define _pq_vector8_fx32(BATCH_OPCODE, DOUBLE_READ_ADDERS, BATCH_MACHINE)                  \
    __asm volatile(                                                                        \
        "loop:                                     \n"                                     \
        "    MCR  p0,%[opcode],r4,c1,c0,%[machine] \n"                                     \
        "    ISB                                   \n"                                     \
        "    MCR  p0,%[opcode],r5,c3,c0,%[machine] \n"                                     \
        "    CMP  r3, #0                           \n"                                     \
        "    ITE  NE                               \n"                                     \
        "    STRDNE r6,r7,[r1],#8                  \n" /* store fourth two results */      \
        "    MOVEQ r3, #1                          \n" /* middle = 1 */                    \
        "    LDMIA  r0!,{r6-r9}                    \n" /* load next 4 datas */             \
        "    MRC  p0,%[dra],r4,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r5,c3,c0,#0            \n"                                     \
        "    MCR  p0,%[opcode],r6,c1,c0,%[machine] \n"                                     \
        "    ISB                                   \n"                                     \
        "    MCR  p0,%[opcode],r7,c3,c0,%[machine] \n"                                     \
        "    STRD r4,r5,[r1],#8                    \n" /* store first two results */       \
        "    MRC  p0,%[dra],r6,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r7,c3,c0,#0            \n"                                     \
        "    MCR  p0,%[opcode],r8,c1,c0,%[machine] \n"                                     \
        "    ISB                                   \n"                                     \
        "    MCR  p0,%[opcode],r9,c3,c0,%[machine] \n"                                     \
        "    STRD r6,r7,[r1],#8                    \n" /* store second two results */      \
        "    LDRD r6,r7,[r0],#8                    \n" /* load last 2 of the 8 */          \
        "    MRC  p0,%[dra],r8,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r9,c3,c0,#0            \n"                                     \
        "    MCR  p0,%[opcode],r6,c1,c0,%[machine] \n"                                     \
        "    ISB                                   \n"                                     \
        "    MCR  p0,%[opcode],r7,c3,c0,%[machine] \n"                                     \
        "    STRD r8,r9,[r1],#8                    \n" /* store third two results */       \
        "    SUBS r2, r2, #8                       \n" /* length -= 8; if (length != 0) */ \
        "    IT   NE                               \n"                                     \
        "    LDRDNE r4,r5,[r0],#8                  \n" /* load first two of next 8 */      \
        "    MRC  p0,%[dra],r6,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r7,c3,c0,#0            \n"                                     \
        "    CMP  r2, #0                           \n" /* repeat while length != 0 */      \
        "    BNE  loop                             \n"                                     \
        "    STRD r6,r7,[r1],#8                    \n" /* store fourth two results */      \
        :                                                                                  \
        : [opcode] "i"(BATCH_OPCODE), [dra] "i"(DOUBLE_READ_ADDERS),                       \
          [machine] "i"(BATCH_MACHINE)                                                     \
        : "r0", "r1", "r2", "cc", "memory")

/*
 * Main loop of the 16-bit fixed-point vector functions; 8 elements per pass.
 *
 * Expects the register contract prepared by _pq_initiate_vector_func_fx16():
 * r0 = source pointer, r1 = destination pointer, r2 = remaining length,
 * r3 = "middle" flag (0 on the first pass), r4/r5 = first two inputs already
 * loaded (sign-extended halfwords); r6-r9 carry further data.
 *
 * Inputs are loaded one halfword at a time with LDRSH, written in pairs with
 * MCR to c1 and c3, read back with MRC from c1 and c3, and stored one
 * halfword at a time with STRH. BATCH_OPCODE and BATCH_MACHINE are the
 * opc1/opc2 immediates of the MCRs; DOUBLE_READ_ADDERS is the opc1 immediate
 * of the MRCs. All must be compile-time constants.
 *
 * The loop leaves only when r2 is exactly zero after "SUBS r2, r2, #8", so
 * the length must be a non-zero multiple of 8.
 *
 * The clobber list tells the compiler that r0-r2, the condition flags and
 * memory are modified. r3-r10 are not listed because they are saved by the
 * PUSH in the initiate macro and restored by the POP in
 * _pq_end_vector_func().
 *
 * NOTE(review): "loop" is an ordinary assembler label; expanding more than
 * one of the _pq_vector8_* macros in one translation unit may give a
 * duplicate-symbol error with GNU-style assemblers - confirm for the
 * toolchains in use.
 */
#define _pq_vector8_fx16(BATCH_OPCODE, DOUBLE_READ_ADDERS, BATCH_MACHINE)                  \
    __asm volatile(                                                                        \
        "loop:                                     \n"                                     \
        "    MCR  p0,%[opcode],r4,c1,c0,%[machine] \n"                                     \
        "    MCR  p0,%[opcode],r5,c3,c0,%[machine] \n"                                     \
        "    CMP  r3, #0                           \n"                                     \
        "    ITTE NE                               \n"                                     \
        "    STRHNE r6,[r1],#2                     \n" /* store fourth two results */      \
        "    STRHNE r7,[r1],#2                     \n" /* store fourth two results */      \
        "    MOVEQ r3, #1                          \n" /* middle = 1 */                    \
        "    LDRSH r6,[r0],#2                      \n" /* load next 2 of the 8 */          \
        "    LDRSH r7,[r0],#2                      \n" /* load next 2 of the 8 */          \
        "    MRC  p0,%[dra],r4,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r5,c3,c0,#0            \n"                                     \
        "    MCR  p0,%[opcode],r6,c1,c0,%[machine] \n"                                     \
        "    MCR  p0,%[opcode],r7,c3,c0,%[machine] \n"                                     \
        "    STRH r4,[r1],#2                       \n" /* store first two results */       \
        "    STRH r5,[r1],#2                       \n" /* store first two results */       \
        "    LDRSH r8,[r0],#2                      \n" /* load next 2 of the 8 */          \
        "    LDRSH r9,[r0],#2                      \n" /* load next 2 of the 8 */          \
        "    MRC  p0,%[dra],r6,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r7,c3,c0,#0            \n"                                     \
        "    MCR  p0,%[opcode],r8,c1,c0,%[machine] \n"                                     \
        "    MCR  p0,%[opcode],r9,c3,c0,%[machine] \n"                                     \
        "    STRH r6,[r1],#2                       \n"  /* store second two results */     \
        "    STRH r7,[r1],#2                       \n"  /* store second two results */     \
        "    LDRSH r6,[r0],#2                       \n" /* load last 2 of the 8 */         \
        "    LDRSH r7,[r0],#2                       \n" /* load last 2 of the 8 */         \
        "    MRC  p0,%[dra],r8,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r9,c3,c0,#0            \n"                                     \
        "    MCR  p0,%[opcode],r6,c1,c0,%[machine] \n"                                     \
        "    MCR  p0,%[opcode],r7,c3,c0,%[machine] \n"                                     \
        "    STRH r8,[r1],#2                       \n" /* store third two results */       \
        "    STRH r9,[r1],#2                       \n" /* store third two results */       \
        "    SUBS r2, r2, #8                       \n" /* length -= 8; if (length != 0) */ \
        "    ITT  NE                               \n"                                     \
        "    LDRSHNE r4,[r0],#2                    \n" /* load first two of next 8 */      \
        "    LDRSHNE r5,[r0],#2                    \n" /* load first two of next 8 */      \
        "    MRC  p0,%[dra],r6,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r7,c3,c0,#0            \n"                                     \
        "    CMP  r2, #0                           \n" /* repeat while length != 0 */      \
        "    BNE  loop                             \n"                                     \
        "    STRH r6,[r1],#2                       \n" /* store fourth two results */      \
        "    STRH r7,[r1],#2                       \n" /* store fourth two results */      \
        :                                                                                  \
        : [opcode] "i"(BATCH_OPCODE), [dra] "i"(DOUBLE_READ_ADDERS),                       \
          [machine] "i"(BATCH_MACHINE)                                                     \
        : "r0", "r1", "r2", "cc", "memory")

/*
 * Main loop of the Q15 vector functions; 8 elements per pass.
 * Load Q15 and left shift 16 bits, calculate and right shift 16 bits, then store.
 *
 * Expects the register contract prepared by _pq_initiate_vector_func_q15():
 * r0 = source pointer, r1 = destination pointer, r2 = remaining length,
 * r3 = "middle" flag (0 on the first pass), r4/r5 = first sample pair already
 * unpacked. Each 32-bit load carries two samples: the low halfword is shifted
 * into the upper 16 bits of one register and the other register keeps the
 * high halfword with its lower 16 bits cleared. Results are repacked with
 * LSR/BFI and stored as one 32-bit word per pair.
 *
 * BATCH_OPCODE and BATCH_MACHINE are the opc1/opc2 immediates of the MCRs;
 * DOUBLE_READ_ADDERS is the opc1 immediate of the MRCs. All must be
 * compile-time constants.
 *
 * The loop leaves only when r2 is exactly zero after "SUBS r2, r2, #8", so
 * the length must be a non-zero multiple of 8.
 *
 * The reload of the next sample pair after SUBS is conditional on NE, which
 * requires an ITTT block and NE-suffixed instructions for all three steps
 * (load, shift, clear); this also keeps the last pass from reading beyond the
 * end of the input.
 *
 * The clobber list tells the compiler that r0-r2, the condition flags and
 * memory are modified. r3-r10 are not listed because they are saved by the
 * PUSH in the initiate macro and restored by the POP in
 * _pq_end_vector_func().
 *
 * NOTE(review): "loop" is an ordinary assembler label; expanding more than
 * one of the _pq_vector8_* macros in one translation unit may give a
 * duplicate-symbol error with GNU-style assemblers - confirm for the
 * toolchains in use.
 */
#define _pq_vector8_q15(BATCH_OPCODE, DOUBLE_READ_ADDERS, BATCH_MACHINE)                   \
    __asm volatile(                                                                        \
        "loop:                                     \n"                                     \
        "    MCR  p0,%[opcode],r4,c1,c0,%[machine] \n"                                     \
        "    ISB                                   \n"                                     \
        "    MCR  p0,%[opcode],r5,c3,c0,%[machine] \n"                                     \
        "    CMP  r3, #0                           \n"                                     \
        "    ITTTE NE                              \n"                                     \
        "    LSRNE r6,r6,#16                       \n" /* store fourth two results */      \
        "    BFINE r7,r6,#0,#16                    \n" /* store fourth two results */      \
        "    STRNE r7,[r1],#4                      \n" /* store fourth two results */      \
        "    MOVEQ r3, #1                          \n" /* middle = 1 */                    \
        "    LDR r7,[r0],#4                        \n" /* load next 2 of the 8 */          \
        "    LSL r6,r7,#16                         \n" /* load next 2 of the 8 */          \
        "    BFC r7,#0,#16                         \n" /* load next 2 of the 8 */          \
        "    MRC  p0,%[dra],r4,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r5,c3,c0,#0            \n"                                     \
        "    MCR  p0,%[opcode],r6,c1,c0,%[machine] \n"                                     \
        "    ISB                                   \n"                                     \
        "    MCR  p0,%[opcode],r7,c3,c0,%[machine] \n"                                     \
        "    LSR r4,r4,#16                         \n" /* store first two results */       \
        "    BFI r5,r4,#0,#16                      \n" /* store first two results */       \
        "    STR r5,[r1],#4                        \n" /* store first two results */       \
        "    LDR r9,[r0],#4                        \n" /* load next 2 of the 8 */          \
        "    LSL r8,r9,#16                         \n" /* load next 2 of the 8 */          \
        "    BFC r9,#0,#16                         \n" /* load next 2 of the 8 */          \
        "    MRC  p0,%[dra],r6,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r7,c3,c0,#0            \n"                                     \
        "    MCR  p0,%[opcode],r8,c1,c0,%[machine] \n"                                     \
        "    ISB                                   \n"                                     \
        "    MCR  p0,%[opcode],r9,c3,c0,%[machine] \n"                                     \
        "    LSR r6,r6,#16                         \n" /* store second two results */      \
        "    BFI r7,r6,#0,#16                      \n" /* store second two results */      \
        "    STR r7,[r1],#4                        \n" /* store second two results */      \
        "    LDR r7,[r0],#4                        \n" /* load next 2 of the 8 */          \
        "    LSL r6,r7,#16                         \n" /* load next 2 of the 8 */          \
        "    BFC r7,#0,#16                         \n" /* load next 2 of the 8 */          \
        "    MRC  p0,%[dra],r8,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r9,c3,c0,#0            \n"                                     \
        "    MCR  p0,%[opcode],r6,c1,c0,%[machine] \n"                                     \
        "    ISB                                   \n"                                     \
        "    MCR  p0,%[opcode],r7,c3,c0,%[machine] \n"                                     \
        "    LSR r8,r8,#16                         \n" /* store third two results */       \
        "    BFI r9,r8,#0,#16                      \n" /* store third two results */       \
        "    STR r9,[r1],#4                        \n" /* store third two results */       \
        "    SUBS r2, r2, #8                       \n" /* length -= 8; if (length != 0) */ \
        "    ITTT NE                               \n"                                     \
        "    LDRNE r5,[r0],#4                      \n" /* load first two of next 8 */      \
        "    LSLNE r4,r5,#16                       \n" /* load first two of next 8 */      \
        "    BFCNE r5,#0,#16                       \n" /* load first two of next 8 */      \
        "    MRC  p0,%[dra],r6,c1,c0,#0            \n"                                     \
        "    MRC  p0,%[dra],r7,c3,c0,#0            \n"                                     \
        "    CMP  r2, #0                           \n" /* repeat while length != 0 */      \
        "    BNE  loop                             \n"                                     \
        "    LSR r6,r6,#16                         \n" /* store fourth two results */      \
        "    BFI r7,r6,#0,#16                      \n" /* store fourth two results */      \
        "    STR r7,[r1],#4                        \n" /* store fourth two results */      \
        :                                                                                  \
        : [opcode] "i"(BATCH_OPCODE), [dra] "i"(DOUBLE_READ_ADDERS),                       \
          [machine] "i"(BATCH_MACHINE)                                                     \
        : "r0", "r1", "r2", "cc", "memory")

/*
 * Main loop of the floating-point biquad vector function; 8 samples per pass.
 *
 * Expects the register contract prepared by _pq_initiate_vector_func():
 * r0 = source pointer, r1 = destination pointer, r2 = remaining length,
 * r3 = "middle" flag (0 on the first pass), r4/r5 = first two inputs already
 * loaded; r6-r9 carry further data.
 *
 * Each sample is written with "MCR p0,#0x1,rN,c0,c0,#6" and its result is
 * read back into the same register with "MRC p0,#0x1,rN,c0,c0,#0" before the
 * next sample is written, so samples are processed strictly in order.
 *
 * The loop leaves only when r2 is exactly zero after "SUBS r2, r2, #8", so
 * the length must be a non-zero multiple of 8.
 *
 * The clobber list tells the compiler that r0-r2, the condition flags and
 * memory are modified. r3-r10 are not listed because they are saved by the
 * PUSH in the initiate macro and restored by the POP in
 * _pq_end_vector_func().
 *
 * NOTE(review): "loop" is an ordinary assembler label; expanding more than
 * one of the _pq_vector8_* macros in one translation unit may give a
 * duplicate-symbol error with GNU-style assemblers - confirm for the
 * toolchains in use.
 */
#define _pq_vector8_biquaddf2_fp()                                                       \
    __asm volatile(                                                                      \
        "loop:                                   \n"                                     \
        "    MCR  p0,#0x1,r4,c0,c0,#6            \n" /* write biquad0*/                  \
        "    CMP  r3, #0                         \n"                                     \
        "    ITE  NE                             \n"                                     \
        "    STRNE r7,[r1],#4                    \n" /* store last result*/              \
        "    MOVEQ r3, #1                        \n" /* middle = 1 */                    \
        "    LDMIA  r0!,{r6-r9}                  \n" /* load next 4 datas */             \
        "    MRC  p0,#0x1,r4,c0,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r5,c0,c0,#6            \n" /* write biquad0 */                 \
        "    MRC  p0,#0x1,r5,c0,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r6,c0,c0,#6            \n" /* write biquad0 */                 \
        "    MRC  p0,#0x1,r6,c0,c0,#0            \n" /* read  biquad0 */                 \
        "    MCR  p0,#0x1,r7,c0,c0,#6            \n" /* write biquad0 */                 \
        "    MRC  p0,#0x1,r7,c0,c0,#0            \n" /* read  biquad0 */                 \
        "    MCR  p0,#0x1,r8,c0,c0,#6            \n" /* write biquad0*/                  \
        "    STMIA    r1!,{r4-r7}                \n" /* store first four results */      \
        "    MRC  p0,#0x1,r8,c0,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r9,c0,c0,#6            \n" /* write biquad0*/                  \
        "    LDRD r6,r7,[r0],#8                  \n" /* load next 2 items*/              \
        "    MRC  p0,#0x1,r9,c0,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r6,c0,c0,#6            \n" /* write biquad0*/                  \
        "    STRD r8,r9,[r1],#8                  \n" /* store third two results */       \
        "    MRC  p0,#0x1,r6,c0,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r7,c0,c0,#6            \n" /* write biquad0*/                  \
        "    SUBS r2, r2, #8                     \n" /* length -= 8; if (length != 0) */ \
        "    IT   NE                             \n"                                     \
        "    LDRDNE r4,r5,[r0],#8                \n" /* load first two of next 8 */      \
        "    STR r6,[r1],#4                      \n" /* store 7th results */             \
        "    MRC  p0,#0x1,r7,c0,c0,#0            \n" /* read  biquad0*/                  \
        "    CMP  r2, #0                         \n" /* repeat while length != 0 */      \
        "    BNE  loop                           \n"                                     \
        "    STR r7,[r1],#4                      \n" /* store last result */             \
        :                                                                                \
        :                                                                                \
        : "r0", "r1", "r2", "cc", "memory")

/*
 * Main loop of the 32-bit fixed-point biquad vector function; 8 samples per
 * pass.
 *
 * Expects the register contract prepared by _pq_initiate_vector_func():
 * r0 = source pointer, r1 = destination pointer, r2 = remaining length,
 * r3 = "middle" flag (0 on the first pass), r4/r5 = first two inputs already
 * loaded; r6-r9 carry further data.
 *
 * Identical in structure to _pq_vector8_biquaddf2_fp() except that the
 * coprocessor register is c1 instead of c0: each sample is written with
 * "MCR p0,#0x1,rN,c1,c0,#6" and its result is read back into the same
 * register with "MRC p0,#0x1,rN,c1,c0,#0" before the next sample is written.
 *
 * The loop leaves only when r2 is exactly zero after "SUBS r2, r2, #8", so
 * the length must be a non-zero multiple of 8.
 *
 * The clobber list tells the compiler that r0-r2, the condition flags and
 * memory are modified. r3-r10 are not listed because they are saved by the
 * PUSH in the initiate macro and restored by the POP in
 * _pq_end_vector_func().
 *
 * NOTE(review): "loop" is an ordinary assembler label; expanding more than
 * one of the _pq_vector8_* macros in one translation unit may give a
 * duplicate-symbol error with GNU-style assemblers - confirm for the
 * toolchains in use.
 */
#define _pq_vector8_biquaddf2_fx32()                                                     \
    __asm volatile(                                                                      \
        "loop:                                   \n"                                     \
        "    MCR  p0,#0x1,r4,c1,c0,#6            \n" /* write biquad0*/                  \
        "    CMP  r3, #0                         \n"                                     \
        "    ITE  NE                             \n"                                     \
        "    STRNE r7,[r1],#4                    \n" /* store last result*/              \
        "    MOVEQ r3, #1                        \n" /* middle = 1 */                    \
        "    LDMIA  r0!,{r6-r9}                  \n" /* load next 4 datas */             \
        "    MRC  p0,#0x1,r4,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r5,c1,c0,#6            \n" /* write biquad0 */                 \
        "    MRC  p0,#0x1,r5,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r6,c1,c0,#6            \n" /* write biquad0 */                 \
        "    MRC  p0,#0x1,r6,c1,c0,#0            \n" /* read  biquad0 */                 \
        "    MCR  p0,#0x1,r7,c1,c0,#6            \n" /* write biquad0 */                 \
        "    MRC  p0,#0x1,r7,c1,c0,#0            \n" /* read  biquad0 */                 \
        "    MCR  p0,#0x1,r8,c1,c0,#6            \n" /* write biquad0*/                  \
        "    STMIA    r1!,{r4-r7}                \n" /* store first four results */      \
        "    MRC  p0,#0x1,r8,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r9,c1,c0,#6            \n" /* write biquad0*/                  \
        "    LDRD r6,r7,[r0],#8                  \n" /* load next 2 items*/              \
        "    MRC  p0,#0x1,r9,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r6,c1,c0,#6            \n" /* write biquad0*/                  \
        "    STRD r8,r9,[r1],#8                  \n" /* store third two results */       \
        "    MRC  p0,#0x1,r6,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r7,c1,c0,#6            \n" /* write biquad0*/                  \
        "    SUBS r2, r2, #8                     \n" /* length -= 8; if (length != 0) */ \
        "    IT   NE                             \n"                                     \
        "    LDRDNE r4,r5,[r0],#8                \n" /* load first two of next 8 */      \
        "    STR r6,[r1],#4                      \n" /* store 7th results */             \
        "    MRC  p0,#0x1,r7,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    CMP  r2, #0                         \n" /* repeat while length != 0 */      \
        "    BNE  loop                           \n"                                     \
        "    STR r7,[r1],#4                      \n" /* store last result */             \
        :                                                                                \
        :                                                                                \
        : "r0", "r1", "r2", "cc", "memory")

/*
 * Inline-assembly kernel that pushes 16-bit samples through the coprocessor
 * register pair commented as "biquad0", eight samples per loop pass.
 *
 * The asm statement declares no operands and no clobber list, so the register
 * contents below have to be arranged by the code that expands the macro:
 *   r0    - source pointer; read with LDRSH and post-incremented by 2.
 *   r1    - destination pointer; written with STRH and post-incremented by 2.
 *   r2    - remaining sample count; decremented by 8 per pass. The loop only
 *           exits when r2 becomes exactly 0, so it is expected to hold a
 *           non-zero multiple of 8 on entry.
 *   r3    - first-pass flag; 0 on entry skips the deferred store of r7 and is
 *           then set to 1, so later passes store the previous pass's last result.
 *   r4,r5 - must already hold the first two input samples on entry (r4 is
 *           written to the coprocessor before anything is loaded).
 *   r6-r9 - overwritten as working registers.
 *
 * The last result of the final pass is stored after the loop falls through.
 *
 * NOTE(review): the label "loop" is a plain named label that the sibling kernel
 * macros in this header use as well - confirm that two of them are never
 * expanded into the same assembly output.
 */
#define _pq_vector8_biquaddf2_fx16()                                                     \
    __asm volatile(                                                                      \
        "loop:                                   \n"                                     \
        "    MCR  p0,#0x1,r4,c1,c0,#6            \n" /* write biquad0*/                  \
        "    CMP  r3, #0                         \n"                                     \
        "    ITE  NE                             \n"                                     \
        "    STRHNE r7,[r1],#2                   \n" /* store last result*/              \
        "    MOVEQ r3, #1                        \n" /* middle = 1 */                    \
        "    LDRSH r6,[r0],#2                    \n" /* load next 2 of the 8*/           \
        "    LDRSH r7,[r0],#2                    \n" /* load next 2 of the 8*/           \
        "    MRC  p0,#0x1,r4,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r5,c1,c0,#6            \n" /* write biquad0 */                 \
        "    MRC  p0,#0x1,r5,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    LDRSH r8,[r0],#2                    \n" /* load next 2 of the 8*/           \
        "    LDRSH r9,[r0],#2                    \n" /* load next 2 of the 8*/           \
        "    MCR  p0,#0x1,r6,c1,c0,#6            \n" /* write biquad0 */                 \
        "    MRC  p0,#0x1,r6,c1,c0,#0            \n" /* read  biquad0 */                 \
        "    MCR  p0,#0x1,r7,c1,c0,#6            \n" /* write biquad0 */                 \
        "    MRC  p0,#0x1,r7,c1,c0,#0            \n" /* read  biquad0 */                 \
        "    STRH r4,[r1],#2                     \n" /* store first 4 results */         \
        "    STRH r5,[r1],#2                     \n" /* store first 4 results */         \
        "    MCR  p0,#0x1,r8,c1,c0,#6            \n" /* write biquad0*/                  \
        "    STRH r6,[r1],#2                     \n" /* store first 4 results */         \
        "    STRH r7,[r1],#2                     \n" /* store first 4 results */         \
        "    MRC  p0,#0x1,r8,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r9,c1,c0,#6            \n" /* write biquad0*/                  \
        "    LDRSH r6,[r0],#2                    \n" /* load next 1 of the 8*/           \
        "    LDRSH r7,[r0],#2                    \n" /* load next 1 of the 8*/           \
        "    MRC  p0,#0x1,r9,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r6,c1,c0,#6            \n" /* write biquad0*/                  \
        "    STRH r8,[r1],#2                     \n" /* store next two results */        \
        "    STRH r9,[r1],#2                     \n" /* store next two results */        \
        "    MRC  p0,#0x1,r6,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MCR  p0,#0x1,r7,c1,c0,#6            \n" /* write biquad0*/                  \
        "    SUBS r2, r2, #8                     \n" /* length -= 8; if (length != 0) */ \
        "    ITT   NE                            \n"                                     \
        "    LDRSHNE r4,[r0],#2                  \n" /* load first two of next 8*/       \
        "    LDRSHNE r5,[r0],#2                  \n" /* load first two of next 8*/       \
        "    STRH r6,[r1],#2                     \n" /* store 7th results */             \
        "    MRC  p0,#0x1,r7,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    CMP  r2, #0                         \n" /* if (length == 0) */              \
        "    BNE  loop                           \n"                                     \
        "    STRH r7,[r1],#2                     \n" /* store last result */             \
        )

/*
 * Inline-assembly kernel that chains two coprocessor stages, eight 32-bit
 * samples per loop pass. Each input word is written to the register commented
 * as "biquad1"; the value read back from it is written to "biquad0"; the value
 * read back from "biquad0" is what gets stored. MRRC is used to fetch one value
 * from each stage in a single instruction.
 *
 * The asm statement declares no operands and no clobber list, so the register
 * contents below have to be arranged by the code that expands the macro:
 *   r0    - source pointer; read with LDMIA/LDRD and advanced past each word.
 *   r1    - destination pointer; written with STMIA/STRD and advanced likewise.
 *   r2    - remaining sample count; decremented by 8 per pass. The loop only
 *           exits when r2 becomes exactly 0, so it is expected to hold a
 *           non-zero multiple of 8 on entry.
 *   r3    - first-pass flag; 0 on entry skips the deferred work on r6/r7 and is
 *           then set to 1, so later passes finish and store the previous pass's
 *           last two results.
 *   r4,r5 - must already hold the first two input words on entry.
 *   r6-r9 - overwritten as working registers.
 *
 * After the loop falls through, the final value is run through "biquad0" and
 * the last two results are stored.
 *
 * NOTE(review): the label "loop" is a plain named label that the sibling kernel
 * macros in this header use as well - confirm that two of them are never
 * expanded into the same assembly output.
 */
#define _pq_vector8_biqauddf2cascade_fp()                                                     \
    __asm volatile(                                                                           \
        "loop:                                   \n"                                          \
        "    MCR  p0,#0x1,r4,c2,c0,#2            \n" /* write biquad1*/                       \
        "    CMP  r3, #0                         \n"                                          \
        "    ITTE  NE                            \n"                                          \
        "    MCRNE  p0,#0x1,r7,c0,c0,#2          \n" /* write biquad0*/                       \
        "    MRRCNE p0,#0,r7,r4,c1               \n" /* read both biquad*/                    \
        "    MRCEQ  p0,#0x1,r4,c2,c0,#0          \n" /* read  biquad1*/                       \
        "    MCR  p0,#0x1,r5,c2,c0,#2            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r4,c0,c0,#2            \n" /* write biquad0*/                       \
        "    CMP  r3, #0                         \n"                                          \
        "    ITE  NE                             \n"                                          \
        "    STRDNE r6,r7,[r1],#8                \n" /* store last two results*/              \
        "    MOVEQ r3, #1                        \n" /* middle = 1 */                         \
        "    LDMIA r0!,{r6-r9}                   \n" /* load next 4 datas */                  \
        "    MRRC p0,#0,r4,r5,c1                 \n" /* read both biquad*/                    \
        "    MCR  p0,#0x1,r6,c2,c0,#2            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r5,c0,c0,#2            \n" /* write biquad0*/                       \
        "    MRRC p0,#0,r5,r6,c1                 \n" /* read both biquad*/                    \
        "    MCR  p0,#0x1,r7,c2,c0,#2            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r6,c0,c0,#2            \n" /* write biquad0*/                       \
        "    MRRC p0,#0,r6,r7,c1                 \n" /* read both biquad*/                    \
        "    MCR  p0,#0x1,r8,c2,c0,#2            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r7,c0,c0,#2            \n" /* write biquad0*/                       \
        "    MRRC p0,#0,r7,r8,c1                 \n" /* read both biquad*/                    \
        "    MCR  p0,#0x1,r9,c2,c0,#2            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r8,c0,c0,#2            \n" /* write biquad0*/                       \
        "    STMIA r1!,{R4-R7}                   \n" /* store first and second two results */ \
        "    LDRD r6,r7,[r0],#8                  \n" /* load last 2 of the 8 */               \
        "    MRRC p0,#0,r8,r9,c1                 \n" /* read both biquad*/                    \
        "    MCR  p0,#0x1,r6,c2,c0,#2            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r9,c0,c0,#2            \n" /* write biquad0*/                       \
        "    SUBS r2, r2, #8                     \n" /* length -= 8; if (length != 0) */      \
        "    IT   NE                             \n"                                          \
        "    LDRDNE r4,r5,[r0],#8                \n" /* load first two of next 8 */           \
        "    MRRC p0,#0,r9,r6,c1                 \n" /* read both biquad*/                    \
        "    MCR  p0,#0x1,r7,c2,c0,#2            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r6,c0,c0,#2            \n" /* write biquad0*/                       \
        "    STRD r8,r9,[r1],#8                  \n" /* store third two results */            \
        "    MRRC p0,#0,r6,r7,c1                 \n" /* read both biquad*/                    \
        "    CMP  r2, #0                         \n" /* if (length == 0) */                   \
        "    BNE  loop                           \n"                                          \
        "    MCR  p0,#0x1,r7,c0,c0,#2            \n" /* write biquad0*/                       \
        "    MRC  p0,#0x1,r7,c0,c0,#0            \n" /* read  biquad0*/                       \
        "    STRD r6,r7,[r1],#8                  \n" /* store fourth two results */           \
        )

/*
 * Inline-assembly kernel that chains two coprocessor stages, eight 32-bit
 * samples per loop pass. Each input word is written to the register commented
 * as "biquad1"; the value read back from it is written to "biquad0"; the value
 * read back from "biquad0" is what gets stored. Every read is a separate MRC.
 *
 * The asm statement declares no operands and no clobber list, so the register
 * contents below have to be arranged by the code that expands the macro:
 *   r0    - source pointer; read with LDMIA/LDRD and advanced past each word.
 *   r1    - destination pointer; written with STMIA/STRD and advanced likewise.
 *   r2    - remaining sample count; decremented by 8 per pass. The loop only
 *           exits when r2 becomes exactly 0, so it is expected to hold a
 *           non-zero multiple of 8 on entry.
 *   r3    - first-pass flag; 0 on entry skips the deferred work on r6/r7 and is
 *           then set to 1, so later passes finish and store the previous pass's
 *           last two results.
 *   r4,r5 - must already hold the first two input words on entry.
 *   r6-r9 - overwritten as working registers.
 *
 * After the loop falls through, the final value is run through "biquad0" and
 * the last two results are stored.
 *
 * NOTE(review): the label "loop" is a plain named label that the sibling kernel
 * macros in this header use as well - confirm that two of them are never
 * expanded into the same assembly output.
 */
#define _pq_vector8_biqauddf2cascade_fx32()                                                   \
    __asm volatile(                                                                           \
        "loop:                                   \n"                                          \
        "    MCR  p0,#0x1,r4,c3,c0,#6            \n" /* write biquad1*/                       \
        "    CMP  r3, #0                         \n"                                          \
        "    ITTTE  NE                           \n"                                          \
        "    MCRNE  p0,#0x1,r7,c1,c0,#6          \n" /* write biquad0*/                       \
        "    MRCNE  p0,#0x1,r7,c1,c0,#0          \n" /* read  biquad0*/                       \
        "    MRCNE  p0,#0x1,r4,c3,c0,#0          \n" /* read  biquad1*/                       \
        "    MRCEQ  p0,#0x1,r4,c3,c0,#0          \n" /* read  biquad1*/                       \
        "    MCR  p0,#0x1,r5,c3,c0,#6            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r4,c1,c0,#6            \n" /* write biquad0*/                       \
        "    CMP  r3, #0                         \n"                                          \
        "    ITE  NE                             \n"                                          \
        "    STRDNE r6,r7,[r1],#8                \n" /* store last two results*/              \
        "    MOVEQ r3, #1                        \n" /* middle = 1 */                         \
        "    LDMIA r0!,{r6-r9}                   \n" /* load next 4 datas */                  \
        "    MRC  p0,#0x1,r4,c1,c0,#0            \n" /* read  biquad0*/                       \
        "    MRC  p0,#0x1,r5,c3,c0,#0            \n" /* read  biquad1*/                       \
        "    MCR  p0,#0x1,r6,c3,c0,#6            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r5,c1,c0,#6            \n" /* write biquad0*/                       \
        "    MRC  p0,#0x1,r5,c1,c0,#0            \n" /* read  biquad0*/                       \
        "    MRC  p0,#0x1,r6,c3,c0,#0            \n" /* read  biquad1*/                       \
        "    MCR  p0,#0x1,r7,c3,c0,#6            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r6,c1,c0,#6            \n" /* write biquad0*/                       \
        "    MRC  p0,#0x1,r6,c1,c0,#0            \n" /* read  biquad0*/                       \
        "    MRC  p0,#0x1,r7,c3,c0,#0            \n" /* read  biquad1*/                       \
        "    MCR  p0,#0x1,r8,c3,c0,#6            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r7,c1,c0,#6            \n" /* write biquad0*/                       \
        "    MRC  p0,#0x1,r7,c1,c0,#0            \n" /* read  biquad0*/                       \
        "    MRC  p0,#0x1,r8,c3,c0,#0            \n" /* read  biquad1*/                       \
        "    MCR  p0,#0x1,r9,c3,c0,#6            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r8,c1,c0,#6            \n" /* write biquad0*/                       \
        "    STMIA r1!,{R4-R7}                   \n" /* store first and second two results */ \
        "    LDRD r6,r7,[r0],#8                  \n" /* load last 2 of the 8 */               \
        "    MRC  p0,#0x1,r8,c1,c0,#0            \n" /* read  biquad0*/                       \
        "    MRC  p0,#0x1,r9,c3,c0,#0            \n" /* read  biquad1*/                       \
        "    MCR  p0,#0x1,r6,c3,c0,#6            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r9,c1,c0,#6            \n" /* write biquad0*/                       \
        "    SUBS r2, r2, #8                     \n" /* length -= 8; if (length != 0) */      \
        "    IT   NE                             \n"                                          \
        "    LDRDNE r4,r5,[r0],#8                \n" /* load first two of next 8 */           \
        "    MRC  p0,#0x1,r9,c1,c0,#0            \n" /* read  biquad0*/                       \
        "    MRC  p0,#0x1,r6,c3,c0,#0            \n" /* read  biquad1*/                       \
        "    MCR  p0,#0x1,r7,c3,c0,#6            \n" /* write biquad1*/                       \
        "    MCR  p0,#0x1,r6,c1,c0,#6            \n" /* write biquad0*/                       \
        "    STRD r8,r9,[r1],#8                  \n" /* store third two results */            \
        "    MRC  p0,#0x1,r6,c1,c0,#0            \n" /* read  biquad0*/                       \
        "    MRC  p0,#0x1,r7,c3,c0,#0            \n" /* read  biquad1*/                       \
        "    CMP  r2, #0                         \n" /* if (length == 0) */                   \
        "    BNE  loop                           \n"                                          \
        "    MCR  p0,#0x1,r7,c1,c0,#6            \n" /* write biquad0*/                       \
        "    MRC  p0,#0x1,r7,c1,c0,#0            \n" /* read  biquad0*/                       \
        "    STRD r6,r7,[r1],#8                  \n" /* store fourth two results */           \
        )

/*
 * Inline-assembly kernel that chains two coprocessor stages, eight 16-bit
 * samples per loop pass. Each input is written to the register commented as
 * "biquad1"; the value read back from it is written to "biquad0"; the value
 * read back from "biquad0" is what gets stored.
 *
 * The asm statement declares no operands and no clobber list, so the register
 * contents below have to be arranged by the code that expands the macro:
 *   r0    - source pointer; read with LDRSH and post-incremented by 2.
 *   r1    - destination pointer; written with STRH and post-incremented by 2.
 *   r2    - remaining sample count; decremented by 8 per pass. The loop only
 *           exits when r2 becomes exactly 0, so it is expected to hold a
 *           non-zero multiple of 8 on entry.
 *   r3    - first-pass flag; 0 on entry skips the deferred work on r6/r7 and is
 *           then set to 1, so later passes finish and store the previous pass's
 *           last two results.
 *   r4,r5 - must already hold the first two input samples on entry.
 *   r6-r9 - overwritten as working registers.
 *
 * After the loop falls through, the final value is run through "biquad0" and
 * the last two results are stored.
 *
 * NOTE(review): the label "loop" is a plain named label that the sibling kernel
 * macros in this header use as well - confirm that two of them are never
 * expanded into the same assembly output.
 */
#define _pq_vector8_biqauddf2cascade_fx16()                                              \
    __asm volatile(                                                                      \
        "loop:                                   \n"                                     \
        "    MCR  p0,#0x1,r4,c3,c0,#6            \n" /* write biquad1*/                  \
        "    CMP  r3, #0                         \n"                                     \
        "    ITTTE  NE                           \n"                                     \
        "    MCRNE  p0,#0x1,r7,c1,c0,#6          \n" /* write biquad0*/                  \
        "    MRCNE  p0,#0x1,r7,c1,c0,#0          \n" /* read  biquad0*/                  \
        "    MRCNE  p0,#0x1,r4,c3,c0,#0          \n" /* read  biquad1*/                  \
        "    MRCEQ  p0,#0x1,r4,c3,c0,#0          \n" /* read  biquad1*/                  \
        "    MCR  p0,#0x1,r5,c3,c0,#6            \n" /* write biquad1*/                  \
        "    MCR  p0,#0x1,r4,c1,c0,#6            \n" /* write biquad0*/                  \
        "    CMP  r3, #0                         \n"                                     \
        "    ITTE  NE                            \n"                                     \
        "    STRHNE r6,[r1],#2                   \n" /* store last two results*/         \
        "    STRHNE r7,[r1],#2                   \n" /* store last two results*/         \
        "    MOVEQ r3, #1                        \n" /* middle = 1 */                    \
        "    LDRSH r6,[r0],#2                    \n" /* load next 2 of the 8*/           \
        "    LDRSH r7,[r0],#2                    \n" /* load next 2 of the 8*/           \
        "    MRC  p0,#0x1,r4,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MRC  p0,#0x1,r5,c3,c0,#0            \n" /* read  biquad1*/                  \
        "    MCR  p0,#0x1,r6,c3,c0,#6            \n" /* write biquad1*/                  \
        "    MCR  p0,#0x1,r5,c1,c0,#6            \n" /* write biquad0*/                  \
        "    MRC  p0,#0x1,r5,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MRC  p0,#0x1,r6,c3,c0,#0            \n" /* read  biquad1*/                  \
        "    LDRSH r8,[r0],#2                    \n" /* load next 2 of the 8*/           \
        "    LDRSH r9,[r0],#2                    \n" /* load next 2 of the 8*/           \
        "    MCR  p0,#0x1,r7,c3,c0,#6            \n" /* write biquad1*/                  \
        "    MCR  p0,#0x1,r6,c1,c0,#6            \n" /* write biquad0*/                  \
        "    MRC  p0,#0x1,r6,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MRC  p0,#0x1,r7,c3,c0,#0            \n" /* read  biquad1*/                  \
        "    STRH r4,[r1],#2                     \n" /* store first 4 results */         \
        "    STRH r5,[r1],#2                     \n" /* store first 4 results */         \
        "    MCR  p0,#0x1,r8,c3,c0,#6            \n" /* write biquad1*/                  \
        "    MCR  p0,#0x1,r7,c1,c0,#6            \n" /* write biquad0*/                  \
        "    MRC  p0,#0x1,r7,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MRC  p0,#0x1,r8,c3,c0,#0            \n" /* read  biquad1*/                  \
        "    MCR  p0,#0x1,r9,c3,c0,#6            \n" /* write biquad1*/                  \
        "    MCR  p0,#0x1,r8,c1,c0,#6            \n" /* write biquad0*/                  \
        "    STRH r6,[r1],#2                     \n" /* store first 4 results */         \
        "    STRH r7,[r1],#2                     \n" /* store first 4 results */         \
        "    LDRSH r6,[r0],#2                    \n" /* load last 2 of the 8*/           \
        "    LDRSH r7,[r0],#2                    \n" /* load last 2 of the 8*/           \
        "    MRC  p0,#0x1,r8,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MRC  p0,#0x1,r9,c3,c0,#0            \n" /* read  biquad1*/                  \
        "    MCR  p0,#0x1,r6,c3,c0,#6            \n" /* write biquad1*/                  \
        "    MCR  p0,#0x1,r9,c1,c0,#6            \n" /* write biquad0*/                  \
        "    SUBS r2, r2, #8                     \n" /* length -= 8; if (length != 0) */ \
        "    ITT   NE                            \n"                                     \
        "    LDRSHNE r4,[r0],#2                  \n" /* load first two of next 8*/       \
        "    LDRSHNE r5,[r0],#2                  \n" /* load first two of next 8*/       \
        "    MRC  p0,#0x1,r9,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MRC  p0,#0x1,r6,c3,c0,#0            \n" /* read  biquad1*/                  \
        "    MCR  p0,#0x1,r7,c3,c0,#6            \n" /* write biquad1*/                  \
        "    MCR  p0,#0x1,r6,c1,c0,#6            \n" /* write biquad0*/                  \
        "    STRH r8,[r1],#2                     \n" /* store third two results */       \
        "    STRH r9,[r1],#2                     \n" /* store third two results */       \
        "    MRC  p0,#0x1,r6,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    MRC  p0,#0x1,r7,c3,c0,#0            \n" /* read  biquad1*/                  \
        "    CMP  r2, #0                         \n" /* if (length == 0) */              \
        "    BNE  loop                           \n"                                     \
        "    MCR  p0,#0x1,r7,c1,c0,#6            \n" /* write biquad0*/                  \
        "    MRC  p0,#0x1,r7,c1,c0,#0            \n" /* read  biquad0*/                  \
        "    STRH r6,[r1],#2                     \n" /* store fourth two results */      \
        "    STRH r7,[r1],#2                     \n" /* store fourth two results */      \
        )

/*!
 * @brief Pack three matrix dimensions into the length word used by the matrix functions.
 *
 * mat1Row is placed unshifted, mat1Col is shifted left by 8 bits and mat2Col by 16 bits.
 * The arguments are not masked, so each one has to fit in 8 bits to avoid overlapping
 * the next field.
 */
#define POWERQUAD_MAKE_MATRIX_LEN(mat1Row, mat1Col, mat2Col) \
    ((((uint32_t)(mat2Col)) << 16U) | (((uint32_t)(mat1Col)) << 8U) | ((uint32_t)(mat1Row)))

/*!
 * @brief PowerQuad computation engine selector.
 *
 * The enumerator values are the same numbers as the CP_PQ, CP_MTX, CP_FFT, CP_FIR
 * and CP_CORDIC macros defined earlier in this header; the value 4 is not assigned.
 */
typedef enum
{
    kPQ_CP_PQ = 0,    /*!< Math engine.*/
    kPQ_CP_MTX = 1,   /*!< Matrix engine.*/
    kPQ_CP_FFT = 2,   /*!< FFT engine.*/
    kPQ_CP_FIR = 3,   /*!< FIR engine.*/
    kPQ_CP_CORDIC = 5 /*!< CORDIC engine.*/
} pq_computationengine_t;

/*!
 * @brief PowerQuad data format type.
 *
 * The values match the 2-bit format encoding listed in the PQ_GetDefaultConfig()
 * description in this header (00b = q15, 01b = q31, 10b = float).
 */
typedef enum
{
    kPQ_16Bit = 0, /*!< Int16 Fixed point.*/
    kPQ_32Bit = 1, /*!< Int32 Fixed point.*/
    kPQ_Float = 2  /*!< Float point.*/
} pq_format_t;

/*!
 * @brief Coprocessor prescale settings, passed to PQ_SetCoprocessorScaler().
 *
 * All three fields are signed 8-bit values.
 */
typedef struct
{
    int8_t inputPrescale;  /*!< Input prescale.*/
    int8_t outputPrescale; /*!< Output prescale.*/
    int8_t outputSaturate; /*!< Output saturates at n bits; for example, 0x11 is 8-bit space,
                                  where the value will be truncated at +127 or -128.*/
} pq_prescale_t;

/*!
 * @brief PowerQuad format and prescale configuration.
 *
 * Filled with defaults by PQ_GetDefaultConfig() and applied with PQ_SetConfig().
 * Each of input A, input B, output and temp has its own format and prescale;
 * machineFormat is the internal format described in the PQ_GetDefaultConfig() notes.
 */
typedef struct
{
    pq_format_t inputAFormat;  /*!< Input A format.*/
    int8_t inputAPrescale;    /*!< Input A prescale, for example 1.5 can be 1.5*2^n if you scale by 'shifting'
                                  ('scaling' by a factor of n).*/
    pq_format_t inputBFormat;  /*!< Input B format.*/
    int8_t inputBPrescale;    /*!< Input B prescale.*/
    pq_format_t outputFormat;  /*!< Output format.*/
    int8_t outputPrescale;    /*!< Output prescale.*/
    pq_format_t tmpFormat;     /*!< Temp format.*/
    int8_t tmpPrescale;       /*!< Temp prescale.*/
    pq_format_t machineFormat; /*!< Machine (internal) format.*/
    uint32_t *tmpBase;         /*!< Temp base address.*/
} pq_config_t;

/*!
 * @brief Biquad filter parameters: two state values followed by five coefficients.
 *
 * All members are single-precision floats.
 */
typedef struct _pq_biquad_param
{
    float v_n_1; /*!< v[n-1], set to 0 at initialization. */
    float v_n;   /*!< v[n], set to 0 at initialization.  */
    float a_1;   /*!< a[1] */
    float a_2;   /*!< a[2] */
    float b_0;   /*!< b[0] */
    float b_1;   /*!< b[1] */
    float b_2;   /*!< b[2] */
} pq_biquad_param_t;

/*! @brief Saved biquad state: the filter parameters plus one internal register value. */
typedef struct _pq_biquad_state
{
    pq_biquad_param_t param; /*!< Filter parameter. */
    uint32_t compreg;        /*!< Internal register, set to 0 at initialization. */
} pq_biquad_state_t;

/*!
 * @brief Instance structure for the direct form II Biquad cascade filter.
 *
 * Holds the stage count and a pointer to the per-stage state entries.
 */
typedef struct
{
    uint8_t numStages;         /**< Number of 2nd order stages in the filter.*/
    pq_biquad_state_t *pState; /**< Points to the array of pq_biquad_state_t entries.*/
} pq_biquad_cascade_df2_instance;

/*!
 * @brief CORDIC iteration count selector.
 *
 * The enumerators take the consecutive values 0, 1 and 2.
 */
typedef enum
{
    kPQ_Iteration_8 = 0, /*!< Iterate 8 times.*/
    kPQ_Iteration_16,    /*!< Iterate 16 times.*/
    kPQ_Iteration_24     /*!< Iterate 24 times.*/
} pq_cordic_iter_t;

/*******************************************************************************
 * API
 ******************************************************************************/

#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */

/*!
 * @name POWERQUAD functional Operation
 * @{
 */

/*!
 * @brief Get default configuration.
 *
 * This function initializes the POWERQUAD configuration structure to a default value.
 * FORMAT register field definitions
 *   Bits[15:8] scaler (for scaled 'q31' formats)
 *   Bits[5:4] external format. 00b=q15, 01b=q31, 10b=float
 *   Bits[1:0] internal format. 00b=q15, 01b=q31, 10b=float
 *   POWERQUAD->INAFORMAT = (config->inputAPrescale << 8) | (config->inputAFormat << 4) | config->machineFormat
 *
 * For all Powerquad operations internal format must be float (with the only exception being
 * the FFT related functions, ie FFT/IFFT/DCT/IDCT which must be set to q31).
 * The default values are:
 *   config->inputAFormat = kPQ_Float;
 *   config->inputAPrescale = 0;
 *   config->inputBFormat = kPQ_Float;
 *   config->inputBPrescale = 0;
 *   config->outputFormat = kPQ_Float;
 *   config->outputPrescale = 0;
 *   config->tmpFormat = kPQ_Float;
 *   config->tmpPrescale = 0;
 *   config->machineFormat = kPQ_Float;
 *
 * @param config Pointer to "pq_config_t" structure.
 */
void PQ_GetDefaultConfig(pq_config_t *config);

/*!
 * @brief Set configuration with format/prescale.
 *
 * @param base  POWERQUAD peripheral base address
 * @param config Pointer to "pq_config_t" structure.
 */
void PQ_SetConfig(POWERQUAD_Type *base, const pq_config_t *config);

/*!
 * @brief Set the coprocessor scaler for coprocessor instructions. This function is used to
 * set output saturation and scaling for input/output.
 *
 * @param base  POWERQUAD peripheral base address
 * @param prescale Pointer to "pq_prescale_t" structure.
 */
void PQ_SetCoprocessorScaler(POWERQUAD_Type *base, const pq_prescale_t *prescale);

/*!
 * @brief Initializes the POWERQUAD module.
 *
 * @param base   POWERQUAD peripheral base address.
 */
void PQ_Init(POWERQUAD_Type *base);

/*!
 * @brief De-initializes the POWERQUAD module.
 *
 * @param base POWERQUAD peripheral base address.
 */
void PQ_Deinit(POWERQUAD_Type *base);

/*!
 * @brief Set format for non-coprocessor instructions.
 *
 * @param base  POWERQUAD peripheral base address
 * @param engine Computation engine
 * @param format Data format
 */
void PQ_SetFormat(POWERQUAD_Type *base, pq_computationengine_t engine, pq_format_t format);

/*!
 * @brief Wait for the completion.
 *
 * @param base  POWERQUAD peripheral base address
 */
void PQ_WaitDone(POWERQUAD_Type *base);

/*!
 * @brief Processing function for the floating-point natural log.
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_ln0() and
 * stores the raw bit pattern returned by _pq_readAdd0() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_LnF32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_ln0(word.bits);
    word.bits = _pq_readAdd0();
    *pDst = word.f;
}

/*!
 * @brief Processing function for the floating-point reciprocal.
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_inv0() and
 * stores the raw bit pattern returned by _pq_readMult0() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_InvF32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_inv0(word.bits);
    word.bits = _pq_readMult0();
    *pDst = word.f;
}

/*!
 * @brief Processing function for the floating-point square-root.
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_sqrt0() and
 * stores the raw bit pattern returned by _pq_readMult0() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_SqrtF32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_sqrt0(word.bits);
    word.bits = _pq_readMult0();
    *pDst = word.f;
}

/*!
 * @brief Processing function for the floating-point inverse square-root.
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_invsqrt0() and
 * stores the raw bit pattern returned by _pq_readMult0() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_InvSqrtF32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_invsqrt0(word.bits);
    word.bits = _pq_readMult0();
    *pDst = word.f;
}

/*!
 * @brief Processing function for the floating-point natural exponent.
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_etox0() and
 * stores the raw bit pattern returned by _pq_readMult0() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_EtoxF32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_etox0(word.bits);
    word.bits = _pq_readMult0();
    *pDst = word.f;
}

/*!
 * @brief Processing function for the floating-point natural exponent with negative parameter.
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_etonx0() and
 * stores the raw bit pattern returned by _pq_readMult0() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_EtonxF32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_etonx0(word.bits);
    word.bits = _pq_readMult0();
    *pDst = word.f;
}

/*!
 * @brief Processing function for the floating-point sine.
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_sin0() and
 * stores the raw bit pattern returned by _pq_readAdd0() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_SinF32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_sin0(word.bits);
    word.bits = _pq_readAdd0();
    *pDst = word.f;
}

/*!
 * @brief Processing function for the floating-point cosine.
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_cos0() and
 * stores the raw bit pattern returned by _pq_readAdd0() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_CosF32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_cos0(word.bits);
    word.bits = _pq_readAdd0();
    *pDst = word.f;
}

/*!
 * @brief Processing function for the floating-point biquad (uses _pq_biquad0).
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_biquad0() and
 * stores the raw bit pattern returned by _pq_readAdd0() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_BiquadF32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_biquad0(word.bits);
    word.bits = _pq_readAdd0();
    *pDst = word.f;
}

/*!
 * @brief Processing function for the floating-point division.
 *
 * Get x1 / x2.
 *
 * The raw bit patterns of both operands are packed into one 64-bit value
 * (x1 in the upper 32 bits, x2 in the lower 32 bits) and handed to _pq_div0();
 * the raw bit pattern returned by _pq_readMult0() is stored to pDst. The
 * float/integer reinterpretation goes through a union instead of pointer casts,
 * so no object is accessed through an incompatible pointer type.
 *
 * @param  x1 points to the dividend
 * @param  x2 points to the divisor
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_DivF32(float32_t *x1, float32_t *x2, float32_t *pDst)
{
    union
    {
        float32_t f;
        uint32_t u;
        q31_t bits;
    } dividend, divisor, quotient;
    uint64_t input;

    dividend.f = *x1;
    divisor.f = *x2;
    input = (uint64_t)(divisor.u) | ((uint64_t)(dividend.u) << 32U);

    _pq_div0(input);
    quotient.bits = _pq_readMult0();
    *pDst = quotient.f;
}

/*!
 * @brief Processing function for the floating-point biquad (uses _pq_biquad1).
 *
 * Reads the single value at pSrc, passes its raw bit pattern to _pq_biquad1() and
 * stores the raw bit pattern returned by _pq_readAdd1() to pDst. The float/integer
 * reinterpretation goes through a union instead of a pointer cast, so no object is
 * accessed through an incompatible pointer type.
 *
 * @param  *pSrc      points to the input value
 * @param  *pDst      points to the location that receives the result
 */
static inline void PQ_Biquad1F32(float32_t *pSrc, float32_t *pDst)
{
    union
    {
        float32_t f;
        q31_t bits;
    } word;

    word.f = *pSrc;
    _pq_biquad1(word.bits);
    word.bits = _pq_readAdd1();
    *pDst = word.f;
}

/*!
 * @brief Fixed-point natural log.
 *
 * Issues the operation with _pq_ln_fx0() and fetches the result with _pq_readAdd0_fx().
 *
 * @param val value to be calculated
 * @return returns ln(val).
 */
static inline q31_t PQ_LnFixed(q31_t val)
{
    q31_t lnResult;

    _pq_ln_fx0(val);
    lnResult = _pq_readAdd0_fx();

    return lnResult;
}

/*!
 * @brief Fixed-point reciprocal.
 *
 * Issues the operation with _pq_inv_fx0() and fetches the result with _pq_readMult0_fx().
 *
 * @param val value to be calculated
 * @return returns inv(val).
 */
static inline q31_t PQ_InvFixed(q31_t val)
{
    q31_t invResult;

    _pq_inv_fx0(val);
    invResult = _pq_readMult0_fx();

    return invResult;
}

/*!
 * @brief Fixed-point square-root.
 *
 * Issues the operation with _pq_sqrt_fx0() and fetches the result with _pq_readMult0_fx().
 * Unlike the other fixed-point helpers, the argument and result are unsigned.
 *
 * @param val value to be calculated
 * @return returns sqrt(val).
 */
static inline uint32_t PQ_SqrtFixed(uint32_t val)
{
    uint32_t sqrtResult;

    _pq_sqrt_fx0(val);
    sqrtResult = _pq_readMult0_fx();

    return sqrtResult;
}

/*!
 * @brief Fixed-point inverse square-root.
 *
 * Issues the operation with _pq_invsqrt_fx0() and fetches the result with _pq_readMult0_fx().
 *
 * @param val value to be calculated
 * @return returns 1/sqrt(val).
 */
static inline q31_t PQ_InvSqrtFixed(q31_t val)
{
    q31_t invSqrtResult;

    _pq_invsqrt_fx0(val);
    invSqrtResult = _pq_readMult0_fx();

    return invSqrtResult;
}

/*!
 * @brief Fixed-point natural exponent.
 *
 * Issues the operation with _pq_etox_fx0() and fetches the result with _pq_readMult0_fx().
 *
 * @param val value to be calculated
 * @return returns e^(val).
 */
static inline q31_t PQ_EtoxFixed(q31_t val)
{
    q31_t expResult;

    _pq_etox_fx0(val);
    expResult = _pq_readMult0_fx();

    return expResult;
}

/*!
 * @brief Fixed-point natural exponent with negative parameter.
 *
 * Issues the operation with _pq_etonx_fx0() and fetches the result with _pq_readMult0_fx().
 *
 * @param val value to be calculated
 * @return returns e^(-val).
 */
static inline q31_t PQ_EtonxFixed(q31_t val)
{
    q31_t expNegResult;

    _pq_etonx_fx0(val);
    expNegResult = _pq_readMult0_fx();

    return expNegResult;
}

/*!
 * @brief Processing function for the fixed-point (Q31) sine.
 *
 * Workaround implementation: the input is converted to float and multiplied by
 * the constant whose IEEE-754 single-precision bit pattern is 0x30c90fdb
 * (pi * 2^-31), the floating-point sine operation _pq_sin0() is issued on that
 * value, and the result is read back through the fixed-point read macro.
 * CPPRE is saved on entry, overwritten for the duration of the call and
 * restored before returning.
 *
 * @param x value to be calculated
 * @return returns sin(x).
 */
static inline q31_t PQ_SinQ31(q31_t x)
{
    PQFLT val;
    uint32_t cppre; /* Saved CPPRE value, restored before returning. */

    cppre = POWERQUAD_NS->CPPRE;
    /* NOTE(review): presumably the same value as POWERQUAD_CPPRE_CPPRE_OUT(31) - confirm. */
    POWERQUAD_NS->CPPRE = (31u) << 8u;

    /* Scale factor pi * 2^-31, loaded through its raw bit pattern. */
    val.for_cp = 0x30c90fdb;
    val.x = (float)x * val.x;

    _pq_sin0(val.for_cp);
    /* The value of this first read is overwritten by the next one; it is kept exactly
     * as in the original sequence. NOTE(review): confirm whether the read is required. */
    val.for_cp = _pq_readAdd0();
    val.for_cp = _pq_readAdd0_fx();

    POWERQUAD_NS->CPPRE = cppre;

    return val.for_cp;
}

/*!
 * @brief Processing function for the fixed-point (Q15) sine.
 *
 * The 16-bit input is moved into the upper half of a 32-bit word, the fixed-point
 * sine operation is issued, and the upper half of the 32-bit result is returned.
 * CPPRE is saved on entry, overwritten for the duration of the call and restored
 * before returning.
 *
 * @param val value to be calculated
 * @return returns sin(val).
 */
static inline q15_t PQ_SinQ15(q15_t val)
{
    q15_t ret;
    uint32_t cppre;
    q31_t input;

    cppre = POWERQUAD_NS->CPPRE;
    /* Don't use 15 here, it is wrong when val is 0x4000 */
    POWERQUAD_NS->CPPRE = POWERQUAD_CPPRE_CPPRE_OUT(31);

    /* Shift in unsigned arithmetic: left-shifting a negative signed value is
     * undefined behavior in C. The resulting bit pattern is the one intended. */
    input = (q31_t)((uint32_t)val << 16);

    _pq_sin_fx0(input);
    ret = (_pq_readAdd0_fx()) >> 16;

    POWERQUAD_NS->CPPRE = cppre;

    return ret;
}

/*!
 * @brief Processing function for the fixed-point (Q31) cosine.
 *
 * Workaround implementation: the input is converted to float and multiplied by
 * the constant whose IEEE-754 single-precision bit pattern is 0x30c90fdb
 * (pi * 2^-31), the floating-point cosine operation _pq_cos0() is issued on that
 * value, and the result is read back through the fixed-point read macro.
 * CPPRE is saved on entry, overwritten for the duration of the call and
 * restored before returning.
 *
 * @param x value to be calculated
 * @return returns cos(x).
 */
static inline q31_t PQ_CosQ31(q31_t x)
{
    PQFLT val;
    uint32_t cppre; /* Saved CPPRE value, restored before returning. */

    cppre = POWERQUAD_NS->CPPRE;
    /* NOTE(review): presumably the same value as POWERQUAD_CPPRE_CPPRE_OUT(31) - confirm. */
    POWERQUAD_NS->CPPRE = (31u) << 8u;

    /* Scale factor pi * 2^-31, loaded through its raw bit pattern. */
    val.for_cp = 0x30c90fdb;
    val.x = (float)x * val.x;

    _pq_cos0(val.for_cp);
    /* The value of this first read is overwritten by the next one; it is kept exactly
     * as in the original sequence. NOTE(review): confirm whether the read is required. */
    val.for_cp = _pq_readAdd0();
    val.for_cp = _pq_readAdd0_fx();

    POWERQUAD_NS->CPPRE = cppre;

    return val.for_cp;
}

/*!
 * @brief Processing function for the fixed cosine.
 *
 * The Q15 input is placed in the upper 16 bits of the 32-bit operand, and the
 * upper 16 bits of the 32-bit result are returned. CPPRE is saved, overridden
 * for the operation, and restored before returning.
 *
 * @param val value to be calculated
 * @return returns cos(val).
 */
static inline q15_t PQ_CosQ15(q15_t val)
{
    uint32_t operand;
    uint32_t result;
    uint32_t cppre;

    cppre = POWERQUAD_NS->CPPRE;
    POWERQUAD_NS->CPPRE = POWERQUAD_CPPRE_CPPRE_OUT(31);

    /* Shift in unsigned arithmetic: left-shifting a negative signed value is undefined behavior in C. */
    operand = (uint32_t)val << 16U;
    _pq_cos_fx0(operand);
    result = (uint32_t)_pq_readAdd0_fx() >> 16U;

    POWERQUAD_NS->CPPRE = cppre;

    return (q15_t)result;
}

/*!
 * @brief Processing function for the fixed biquad.
 *
 * Feeds one sample to _pq_biquad0_fx() and fetches the result through
 * _pq_readAdd0_fx().
 *
 * @param val value to be calculated
 * @return returns biquad(val).
 */
static inline q31_t PQ_BiquadFixed(q31_t val)
{
    q31_t output;

    _pq_biquad0_fx(val);
    output = _pq_readAdd0_fx();

    return output;
}

/*!
 * @brief Processing function for the floating-point vectorised natural log.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorLnF32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the floating-point vectorised reciprocal.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorInvF32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the floating-point vectorised square-root.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorSqrtF32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the floating-point vectorised inverse square-root.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorInvSqrtF32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the floating-point vectorised natural exponent.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorEtoxF32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the floating-point vectorised natural exponent with negative parameter.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorEtonxF32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the floating-point vectorised sine
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorSinF32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the floating-point vectorised cosine.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorCosF32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised natural log.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorLnQ31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised reciprocal.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorInvQ31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised square-root.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorSqrtQ31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised inverse square-root.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorInvSqrtQ31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised natural exponent.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorEtoxQ31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised natural exponent with negative parameter.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorEtonxQ31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised sine
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorSinQ15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised cosine.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorCosQ15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised sine
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorSinQ31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised cosine.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorCosQ31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised natural log.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorLnQ15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised reciprocal.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorInvQ15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised square-root.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorSqrtQ15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised inverse square-root.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorInvSqrtQ15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised natural exponent.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorEtoxQ15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised natural exponent with negative parameter.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorEtonxQ15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the floating-point vectorised biquad direct form II.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length the block size of input data.
 */
void PQ_VectorBiqaudDf2F32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised biquad direct form II.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorBiqaudDf2Q31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised biquad direct form II.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorBiqaudDf2Q15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the floating-point vectorised biquad cascade direct form II.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorBiqaudCascadeDf2F32(float32_t *pSrc, float32_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q31 vectorised biquad cascade direct form II.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorBiqaudCascadeDf2Q31(q31_t *pSrc, q31_t *pDst, q31_t length);

/*!
 * @brief Processing function for the Q15 vectorised biquad cascade direct form II.
 *
 * @param  *pSrc      points to the block of input data
 * @param  *pDst      points to the block of output data
 * @param  length     the block size of input data.
 */
void PQ_VectorBiqaudCascadeDf2Q15(q15_t *pSrc, q15_t *pDst, q31_t length);

/*!
 * @brief Processing function for the fixed arctangent.
 *
 * @param base  POWERQUAD peripheral base address
 * @param x value of opposite
 * @param y value of adjacent
 * @param iteration iteration times
 * @return The return value is in the range of -2^27 to 2^27, which means -pi to pi.
 * @note The sum of x and y should not exceed the range of int32_t.
 * @note Larger input number gets higher output accuracy, for example the arctan(0.5),
 * the result of PQ_ArctanFixed(POWERQUAD, 100000, 200000, kPQ_Iteration_24) is more
 * accurate than PQ_ArctanFixed(POWERQUAD, 1, 2, kPQ_Iteration_24).
 */
q31_t PQ_ArctanFixed(POWERQUAD_Type *base, q31_t x, q31_t y, pq_cordic_iter_t iteration);

/*!
 * @brief Processing function for the fixed inverse hyperbolic tangent (arctanh).
 *
 * @param base  POWERQUAD peripheral base address
 * @param x value of opposite
 * @param y value of adjacent
 * @param iteration iteration times
 * @return The return value is in the range of -2^27 to 2^27, which means -1 to 1.
 * @note The sum of x and y should not exceed the range of int32_t.
 * @note Larger input number gets higher output accuracy, for example the arctanh(0.5),
 * the result of PQ_ArctanhFixed(POWERQUAD, 100000, 200000, kPQ_Iteration_24) is more
 * accurate than PQ_ArctanhFixed(POWERQUAD, 1, 2, kPQ_Iteration_24).
 */
q31_t PQ_ArctanhFixed(POWERQUAD_Type *base, q31_t x, q31_t y, pq_cordic_iter_t iteration);

/*!
 * @brief Processing function for the fixed biquad.
 *
 * Feeds one sample to _pq_biquad1_fx() and fetches the result through
 * _pq_readAdd1_fx().
 *
 * @param val value to be calculated
 * @return returns biquad(val).
 */
static inline q31_t PQ_Biquad1Fixed(q31_t val)
{
    q31_t output;

    _pq_biquad1_fx(val);
    output = _pq_readAdd1_fx();

    return output;
}

/*!
 * @brief Processing function for the complex FFT.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length number of input samples
 * @param pData input data
 * @param pResult output data.
 */
void PQ_TransformCFFT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult);

/*!
 * @brief Processing function for the real FFT.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length number of input samples
 * @param pData input data
 * @param pResult output data.
 */
void PQ_TransformRFFT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult);

/*!
 * @brief Processing function for the inverse complex FFT.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length number of input samples
 * @param pData input data
 * @param pResult output data.
 */
void PQ_TransformIFFT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult);

/*!
 * @brief Processing function for the complex DCT.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length number of input samples
 * @param pData input data
 * @param pResult output data.
 */
void PQ_TransformCDCT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult);

/*!
 * @brief Processing function for the real DCT.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length number of input samples
 * @param pData input data
 * @param pResult output data.
 */
void PQ_TransformRDCT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult);

/*!
 * @brief Processing function for the inverse complex DCT.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length number of input samples
 * @param pData input data
 * @param pResult output data.
 */
void PQ_TransformIDCT(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult);

/*!
 * @brief Processing function for backing up the biquad context.
 *
 * @param base  POWERQUAD peripheral base address
 * @param biquad_num biquad side
 * @param state pointer to the states.
 */
void PQ_BiquadBackUpInternalState(POWERQUAD_Type *base, int32_t biquad_num, pq_biquad_state_t *state);

/*!
 * @brief Processing function for restoring the biquad context.
 *
 * @param base  POWERQUAD peripheral base address
 * @param biquad_num biquad side
 * @param state pointer to the states.
 */
void PQ_BiquadRestoreInternalState(POWERQUAD_Type *base, int32_t biquad_num, pq_biquad_state_t *state);

/*!
 * @brief  Initialization function for the direct form II Biquad cascade filter.
 *
 * @param[in,out] *S           points to an instance of the filter data structure.
 * @param[in]     numStages    number of 2nd order stages in the filter.
 * @param[in]     *pState      points to the state buffer.
 */
void PQ_BiquadCascadeDf2Init(pq_biquad_cascade_df2_instance *S, uint8_t numStages, pq_biquad_state_t *pState);

/*!
 * @brief Processing function for the floating-point direct form II Biquad cascade filter.
 *
 * @param[in]  *S        points to an instance of the filter data structure.
 * @param[in]  *pSrc     points to the block of input data.
 * @param[out] *pDst     points to the block of output data
 * @param[in]  blockSize number of samples to process.
 */
void PQ_BiquadCascadeDf2F32(const pq_biquad_cascade_df2_instance *S,
                            float32_t *pSrc,
                            float32_t *pDst,
                            uint32_t blockSize);

/*!
 * @brief Processing function for the Q31 direct form II Biquad cascade filter.
 *
 * @param[in]  *S        points to an instance of the filter data structure.
 * @param[in]  *pSrc     points to the block of input data.
 * @param[out] *pDst     points to the block of output data
 * @param[in]  blockSize number of samples to process.
 */
void PQ_BiquadCascadeDf2Q31(const pq_biquad_cascade_df2_instance *S, q31_t *pSrc, q31_t *pDst, uint32_t blockSize);

/*!
 * @brief Processing function for the Q15 direct form II Biquad cascade filter.
 *
 * @param[in]  *S        points to an instance of the filter data structure.
 * @param[in]  *pSrc     points to the block of input data.
 * @param[out] *pDst     points to the block of output data
 * @param[in]  blockSize number of samples to process.
 */
void PQ_BiquadCascadeDf2Q15(const pq_biquad_cascade_df2_instance *S, q15_t *pSrc, q15_t *pDst, uint32_t blockSize);

/*!
 * @brief Processing function for the FIR.
 *
 * @param base  POWERQUAD peripheral base address
 * @param pAData the first input sequence
 * @param ALength number of the first input sequence
 * @param pBData the second input sequence
 * @param BLength number of the second input sequence
 * @param pResult array for the output data
 * @param opType operation type, could be PQ_FIR_FIR, PQ_FIR_CONVOLUTION, PQ_FIR_CORRELATION.
 */
void PQ_FIR(
    POWERQUAD_Type *base, void *pAData, int32_t ALength, void *pBData, int32_t BLength, void *pResult, uint32_t opType);

/*!
 * @brief Processing function for the incremental FIR.
 *        This function can be used after PQ_FIR() for incremental FIR
 *        operation when new x data are available
 *
 * @param base  POWERQUAD peripheral base address
 * @param ALength number of input samples
 * @param BLength number of taps
 * @param xOffset offset for number of input samples
 * @param opType FIR operation type, could be PQ_FIR_FIR.
 */
void PQ_FIRIncrement(POWERQUAD_Type *base, int32_t ALength, int32_t BLength, int32_t xOffset, uint32_t opType);

/*!
 * @brief Processing function for the matrix addition.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length rows and cols for matrix. LENGTH register configuration:
 *        LENGTH[23:16] = M2 cols
 *        LENGTH[15:8]  = M1 cols
 *        LENGTH[7:0]   = M1 rows
 *        This could be constructed using macro @ref POWERQUAD_MAKE_MATRIX_LEN.
 * @param pAData input matrix A
 * @param pBData input matrix B
 * @param pResult array for the output data.
 */
void PQ_MatrixAddition(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult);

/*!
 * @brief Processing function for the matrix subtraction.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length rows and cols for matrix. LENGTH register configuration:
 *        LENGTH[23:16] = M2 cols
 *        LENGTH[15:8]  = M1 cols
 *        LENGTH[7:0]   = M1 rows
 *        This could be constructed using macro @ref POWERQUAD_MAKE_MATRIX_LEN.
 * @param pAData input matrix A
 * @param pBData input matrix B
 * @param pResult array for the output data.
 */
void PQ_MatrixSubtraction(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult);

/*!
 * @brief Processing function for the matrix multiplication.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length rows and cols for matrix. LENGTH register configuration:
 *        LENGTH[23:16] = M2 cols
 *        LENGTH[15:8]  = M1 cols
 *        LENGTH[7:0]   = M1 rows
 *        This could be constructed using macro @ref POWERQUAD_MAKE_MATRIX_LEN.
 * @param pAData input matrix A
 * @param pBData input matrix B
 * @param pResult array for the output data.
 */
void PQ_MatrixMultiplication(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult);

/*!
 * @brief Processing function for the matrix product.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length rows and cols for matrix. LENGTH register configuration:
 *        LENGTH[23:16] = M2 cols
 *        LENGTH[15:8]  = M1 cols
 *        LENGTH[7:0]   = M1 rows
 *        This could be constructed using macro @ref POWERQUAD_MAKE_MATRIX_LEN.
 * @param pAData input matrix A
 * @param pBData input matrix B
 * @param pResult array for the output data.
 */
void PQ_MatrixProduct(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult);

/*!
 * @brief Processing function for the vector dot product.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length length of vector
 * @param pAData input vector A
 * @param pBData input vector B
 * @param pResult array for the output data.
 */
void PQ_VectorDotProduct(POWERQUAD_Type *base, uint32_t length, void *pAData, void *pBData, void *pResult);

/*!
 * @brief Processing function for the matrix inverse.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length rows and cols for matrix. LENGTH register configuration:
 *        LENGTH[23:16] = M2 cols
 *        LENGTH[15:8]  = M1 cols
 *        LENGTH[7:0]   = M1 rows
 *        This could be constructed using macro @ref POWERQUAD_MAKE_MATRIX_LEN.
 * @param pData input matrix
 * @param pTmpData input temporary matrix, pTmpData length must be not less than pData length; 1024 words is
 * sufficient for the largest supported matrix.
 * @param pResult array for the output data, round down for fixed point.
 */
void PQ_MatrixInversion(POWERQUAD_Type *base, uint32_t length, void *pData, void *pTmpData, void *pResult);

/*!
 * @brief Processing function for the matrix transpose.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length rows and cols for matrix. LENGTH register configuration:
 *        LENGTH[23:16] = M2 cols
 *        LENGTH[15:8]  = M1 cols
 *        LENGTH[7:0]   = M1 rows
 *        This could be constructed using macro @ref POWERQUAD_MAKE_MATRIX_LEN.
 * @param pData input matrix
 * @param pResult array for the output data.
 */
void PQ_MatrixTranspose(POWERQUAD_Type *base, uint32_t length, void *pData, void *pResult);

/*!
 * @brief Processing function for the matrix scale.
 *
 * @param base  POWERQUAD peripheral base address
 * @param length rows and cols for matrix. LENGTH register configuration:
 *        LENGTH[23:16] = M2 cols
 *        LENGTH[15:8]  = M1 cols
 *        LENGTH[7:0]   = M1 rows
 *        This could be constructed using macro @ref POWERQUAD_MAKE_MATRIX_LEN.
 * @param misc scaling parameters
 * @param pData input matrix
 * @param pResult array for the output data.
 */
void PQ_MatrixScale(POWERQUAD_Type *base, uint32_t length, float misc, void *pData, void *pResult);

/* @} */

#if defined(__cplusplus)
}

#endif /* __cplusplus */

/*! @}*/

#endif /* _FSL_POWERQUAD_H_ */
