/*
 * Copyright 2019-2021 NXP
 * All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

/* FreeRTOS kernel includes. */
#include "FreeRTOS.h"
#include "task.h"

#include "fsl_debug_console.h"
#include "board.h"
#include "vglite_support.h"
#include "vglite_window.h"
#include "tiger_paths.h"
/*-----------------------------------------------------------*/
#include "vg_lite.h"

#include "pin_mux.h"
#include "fsl_gpio.h"
#include "clock_config.h"
#include "display_support.h"

/*-----------------------------------------------------------*/
#include "fsl_common.h"
#include "fsl_mu.h"
#include "fsl_usart.h"
#include "arm_math.h"
#include "dsp_support.h"
#include "fsl_power.h"
#include "fsl_ctimer.h"
/*******************************************************************************
 * Definitions
 ******************************************************************************/
/* Buffer count constant (not referenced in the visible part of this file). */
#define APP_BUFFER_COUNT 2
/* Default size as a float constant. The macro deliberately carries no trailing
 * semicolon so that it can be used inside larger expressions. */
#define DEFAULT_SIZE     256.0f

/* Messaging unit instance used to talk to the DSP core */
#define APP_MU MUA
/* Flag value reported by the messaging unit once the DSP core has booted */
#define BOOT_FLAG 0x01U

/* Channel transmit and receive register */
#define CHN_MU_REG_NUM 0U

/* Number of messages used to test message sending */
#define MSG_LENGTH 32U

/* Convert a float to Q31 fixed point by scaling it with 2^31 */
#define FLOAT_2_Q31(x) ((int32_t)((x)*2147483648.0f))
/* Element counts of the benchmark data sets */
#define VEC_ADD_LENGTH 200
#define VEC_DOT_LENGTH 16
#define MTX_TRANS_LENGHT 4
/* Number of times each math function is executed while it is being timed */
#define LOOP_COUNT     10000

/* USART instance used for the interactive menu */
#define DEMO_USART              USART0
#define DEMO_USART_CLK_SRC      kCLOCK_Flexcomm0Clk
#define DEMO_USART_CLK_FREQ     CLOCK_GetFlexcommClkFreq(0U)
#define DEMO_USART_IRQHandler   FLEXCOMM0_IRQHandler
#define DEMO_USART_IRQn         FLEXCOMM0_IRQn

#define CTIMER          CTIMER2         /* Timer 2 */
#define CTIMER_MAT0_OUT kCTIMER_Match_0 /* Match output 0 */
#define CTIMER_CLK_FREQ CLOCK_GetCtimerClkFreq(2)
/* Core-clock cycles elapsed in 5 microseconds (.000005 s). The original note gives
 * 990 cycles per 5 microseconds - presumably for a 198 MHz core clock, TODO confirm. */
#define CYCLES_PER_COUNT  ((BOARD_BOOTCLOCKRUN_CORE_CLOCK)*(.000005))

/*******************************************************************************
 * Prototypes
 ******************************************************************************/
/* FreeRTOS task that renders the tiger animation */
static void vglite_task(void *pvParameters);
/* Interactive UART menu that runs the math benchmarks before the graphics demo starts */
static void cpu_test();
/* Board peripheral setup helpers */
void LED_INIT();
void CTIMER_INIT();
/* CTIMER match-0 callback; increments the benchmark tick counter */
void ctimer_match0_callback(uint32_t flags);
/* CM33 benchmark routines, one per menu entry */
static void arm_mat_sqrt_Test();
static void arm_mat_sine_Test();
static void arm_mat_vec_add_Test();
static void arm_mat_vec_dot_Test();
static void arm_mat_mtx_inv_Test();
static void arm_mat_mtx_tnsp_Test();

/*******************************************************************************
 * Variables
 ******************************************************************************/
/* Display and window objects created in init_vg_lite() */
static vg_lite_display_t display;
static vg_lite_window_t window;

/* Animation state used by animateTiger(): zoomOut selects the scaling direction,
 * scaleCount counts the scaling steps applied (0..5). */
static int zoomOut    = 0;
static int scaleCount = 0;
/* Transformation matrix applied to every tiger path when drawing */
static vg_lite_matrix_t matrix;

/* The application provides the VGLite heap itself, so the custom memory
 * configuration must be enabled at build time. */
#if (CUSTOM_VGLITE_MEMORY_CONFIG != 1)
#error "Application must be compiled with CUSTOM_VGLITE_MEMORY_CONFIG=1"
#else
#define VGLITE_COMMAND_BUFFER_SZ (128 * 1024)
/* Tessellation window and heap size are selected from the panel resolution. */
#if (720 * 1280 == (DEMO_PANEL_WIDTH) * (DEMO_PANEL_HEIGHT))
/* Tessellation window = 720 x 640 */
#define TW             720
#define TH             640
#define VGLITE_HEAP_SZ 3955776 /* 3.8 MB */
#elif (400 * 400 == (DEMO_PANEL_WIDTH) * (DEMO_PANEL_HEIGHT))
/* Tessellation window = 400 x 400 */
#define TW             400
#define TH             400
#define VGLITE_HEAP_SZ 1544704 /* 1.5 MB */
#else
/* Tessellation window = 256 x 256 */
#define TW             256
#define TH             256
#define VGLITE_HEAP_SZ 787456 /* 0.76 MB */
#endif
/* Allocate the heap and set the command buffer(s) size */
AT_NONCACHEABLE_SECTION_ALIGN(uint8_t vglite_heap[VGLITE_HEAP_SZ], 64);

/* Heap base, heap size and command buffer size exported as globals
 * (presumably consumed by the VGLite driver - TODO confirm). */
void *vglite_heap_base        = &vglite_heap;
uint32_t vglite_heap_size     = VGLITE_HEAP_SZ;
uint32_t vglite_cmd_buff_size = VGLITE_COMMAND_BUFFER_SZ;
#endif

/* Number of CTIMER match-0 events since the counter was last reset. It is
 * incremented from ctimer_match0_callback() (match interrupt enabled in
 * CTIMER_INIT()) and read by the benchmark routines, so it is volatile like the
 * other interrupt-shared flags in this file. */
static volatile uint32_t countUseconds = 0;
/* Array of function pointers for callback for each channel */
ctimer_callback_t ctimer_callback_table[] = {
    ctimer_match0_callback, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
/* Match Configuration for Channel 0 */
static ctimer_match_config_t matchConfig0;
/* CTIMER base configuration filled in by CTIMER_INIT() */
ctimer_config_t ctimerConfig;

/* Set by the USART interrupt handler when a byte has been received; cleared by cpu_test() */
static volatile bool uartTyped = false;
/* Last byte read from the USART by the interrupt handler */
static volatile uint8_t dataTyped = 0;

/* Square root benchmark: result and expected value (0.5 in Q31) */
q31_t sqrtResult = 0;
const q31_t sqrtRef    = FLOAT_2_Q31(0.5f);
/* Sine benchmark: result and expected value (0.5 in Q31) */
q31_t sinResult  = 0;
const q31_t sinRef     = FLOAT_2_Q31(0.5f);

/* Input vectors of the dot product benchmark; vec_dot_b mirrors vec_dot_a with alternating signs */
const float32_t vec_dot_a[VEC_DOT_LENGTH] = {1.01, 2.02,  3.03,  4.04,  5.05,  6.06,  7.07,  8.08, 9.09, 10.10, 11.11, 12.12, 13.13, 14.14, 15.15, 16.16};
const float32_t vec_dot_b[VEC_DOT_LENGTH] = {-1.01, 2.02,  -3.03,  4.04,  -5.05,  6.06,  -7.07,  8.08, -9.09, 10.10, -11.11, 12.12, -13.13, 14.14, -15.15, 16.16};

/* 2x2 matrix inverse benchmark: input, result and reference storage */
float32_t M1[4]            = {0.0, 0.0, 0.0, 0.0};
float32_t inverseResult[4] = {0.0, 0.0, 0.0, 0.0};
float32_t inverseRef[4]    = {0.0, 0.0, 0.0, 0.0};

/* Vector add benchmark: operands, result and reference */
static q31_t vec_add_x[VEC_ADD_LENGTH];
static q31_t vec_add_y[VEC_ADD_LENGTH];
static q31_t vec_add_out[VEC_ADD_LENGTH];
static q31_t vec_add_out_ref[VEC_ADD_LENGTH];

/* Matrix transpose benchmark: input, result and reference (expected) matrices */
static float32_t transpose[MTX_TRANS_LENGHT*MTX_TRANS_LENGHT];
static float32_t transposeResult[MTX_TRANS_LENGHT*MTX_TRANS_LENGHT];
static float32_t transposeRel[MTX_TRANS_LENGHT*MTX_TRANS_LENGHT];

/*******************************************************************************
 * Code
 ******************************************************************************/
#if (DEMO_PANEL_RM67162 == DEMO_PANEL)
/*
 * GPIO interrupt A handler.
 *
 * Reads the pending interrupt status of the MIPI TE port, acknowledges all of it,
 * and forwards the event to the display TE handler when the TE pin was among the
 * pending sources.
 */
void GPIO_INTA_IRQHandler(void)
{
    const uint32_t pending = GPIO_PortGetInterruptStatus(GPIO, BOARD_MIPI_TE_PORT, 0);

    /* Acknowledge everything that was pending before dispatching. */
    GPIO_PortClearInterruptFlags(GPIO, BOARD_MIPI_TE_PORT, 0, pending);

    if ((pending & (1U << BOARD_MIPI_TE_PIN)) != 0U)
    {
        BOARD_DisplayTEPinHandler();
    }
}
#endif

void DEMO_USART_IRQHandler(void)
{
        /* If new data arrived. */
    if ((kUSART_RxFifoNotEmptyFlag | kUSART_RxError) & USART_GetStatusFlags(DEMO_USART))
    {
        dataTyped = USART_ReadByte(DEMO_USART);
        uartTyped = true;
    }
/* Add for ARM errata 838869, affects Cortex-M4, Cortex-M4F Store immediate overlapping
  exception return operation might vector to incorrect interrupt */
#if defined __CORTEX_M && (__CORTEX_M == 4U)
    __DSB();
#endif
}

/*
 * CTIMER match-0 callback: counts one more timer match event.
 * The flags argument is not used.
 */
void ctimer_match0_callback(uint32_t flags)
{
    (void)flags;
    countUseconds += 1U;
}

/*
 * Busy-wait delay: spins through 5,000,000 iterations of a NOP.
 * The counter is volatile so the loop cannot be optimised away.
 */
void delay()
{
    volatile uint32_t spin = 0;

    spin = 0;
    while (spin < 5000000)
    {
        __NOP();
        ++spin;
    }
}

/*
 * Application entry point.
 *
 * Brings up pins, clocks, debug console and PSRAM, initialises the CTIMER, LEDs and
 * the messaging unit, starts the DSP core and waits for its boot flag, enables
 * interrupt-driven UART reception, runs the interactive CPU benchmark menu, and
 * finally creates the VGLite task and starts the FreeRTOS scheduler.
 */
int main(void)
{
    status_t psramStatus;
    usart_config_t usartConfig;

    /* Init board hardware. */
    BOARD_InitPins();
    BOARD_InitUARTPins();
    BOARD_InitPsRamPins();

#if (DEMO_PANEL_TFT_PROTO_5 == DEMO_PANEL)
    BOARD_InitFlexIOPanelPins();

    GPIO_PortInit(GPIO, BOARD_SSD1963_RST_PORT);
    GPIO_PortInit(GPIO, BOARD_SSD1963_CS_PORT);
    GPIO_PortInit(GPIO, BOARD_SSD1963_RS_PORT);
#else
    BOARD_InitMipiPanelPins();

    GPIO_PortInit(GPIO, BOARD_MIPI_POWER_PORT);
    GPIO_PortInit(GPIO, BOARD_MIPI_BL_PORT);
    GPIO_PortInit(GPIO, BOARD_MIPI_RST_PORT);

#if (DEMO_PANEL_RM67162 == DEMO_PANEL)
    GPIO_PortInit(GPIO, BOARD_MIPI_TE_PORT);
#endif

#endif

    BOARD_BootClockRUN();
    BOARD_InitDebugConsole();

    /* PSRAM must come up successfully before continuing. */
    psramStatus = BOARD_InitPsRam();
    if (psramStatus != kStatus_Success)
    {
        assert(false);
    }

    /* Timer used by the benchmarks, then the LEDs. */
    CTIMER_INIT();
    LED_INIT();

    /* Release the messaging unit (MUA) from reset and initialise it. */
    RESET_PeripheralReset(kMU_RST_SHIFT_RSTn);
    MU_Init(APP_MU);

    /* Copy DSP image to RAM and start DSP core. */
    BOARD_DSP_Init();
    /* Spin until the DSP core reports that it has booted. */
    while (BOOT_FLAG != MU_GetFlags(APP_MU))
    {
    }

    /* Enable Rx and Tx on UART */
    USART_GetDefaultConfig(&usartConfig);
    usartConfig.enableRx     = true;
    usartConfig.enableTx     = true;
    usartConfig.baudRate_Bps = BOARD_DEBUG_UART_BAUDRATE;
    USART_Init(DEMO_USART, &usartConfig, DEMO_USART_CLK_FREQ);

    /* Enable RX interrupt. */
    USART_EnableInterrupts(DEMO_USART, kUSART_RxLevelInterruptEnable | kUSART_RxErrorInterruptEnable);
    EnableIRQ(DEMO_USART_IRQn);

    /* Execute CPU test - CM33 vs FusionF1 */
    cpu_test();

    /* Erase to beginning of screen, then place cursor at top */
    PRINTF("\033[1J");
    PRINTF("\033[f");
    PRINTF("This Graphics Demo provides an image of a Tiger Head in vector format that is shown on the rectangular display in continuous rotation in the range [0-360] and continuous scaling in the range [x-y]\r\n\r\n");

    if (pdPASS !=
        xTaskCreate(vglite_task, "vglite_task", configMINIMAL_STACK_SIZE + 200, NULL, configMAX_PRIORITIES - 1, NULL))
    {
        PRINTF("Task creation failed!.\r\n");
        for (;;)
        {
        }
    }

    vTaskStartScheduler();
    for (;;)
    {
    }
}

static void cpu_test()
{
   uint8_t graphicTest = 0;

   /* Erase to beginning of screen */
   PRINTF("\033[1J");
   /* Place cursor at top */
   PRINTF("\033[f");

   PRINTF("Type a number between 1 - 6 to select a function and execute it on CM33 and FusionF1.\r\nType 7 to change to Graphic demo\r\n");
   PRINTF("1.SQRT\r\n2.SINE\r\n3.VECTOR ADD\r\n4.VECTOR DOT\r\n5.INV MATRIX\r\n6.MATRIX TRANSPOSE\r\n7.GRAPHIC DEMO\r\n\r\n");

    while(!graphicTest)
    {
      if(uartTyped)
        {
	  /* Erase to beginning of screen */
          PRINTF("\033[1J");
          /* Place cursor at top */
          PRINTF("\033[f");
          PRINTF("Type a number between 1 - 6 to select a function and execute it on CM33 and FusionF1.\r\nType 7 to change to Graphic demo\r\n");
          PRINTF("1.SQRT\r\n2.SINE\r\n3.VECTOR ADD\r\n4.VECTOR DOT\r\n5.INV MATRIX\r\n6.MATRIX TRANSPOSE\r\n7.GRAPHIC DEMO\r\n\r\n");

          switch(dataTyped)
          {
          case '1':
			/* Execute square root */
			arm_mat_sqrt_Test();
			/* Communicate with FusionF1 to execute math function */
			MU_SendMsg(APP_MU, CHN_MU_REG_NUM, 1);
			dataTyped = 0;
			break;
		  case '2':
			/* Execute sine */
			arm_mat_sine_Test();
			/* Communicate with FusionF1 to execute math function */
			MU_SendMsg(APP_MU, CHN_MU_REG_NUM, 2);
			dataTyped = 0;
			break;
		  case '3':
			/* Execute vector add */
			arm_mat_vec_add_Test();
			/* Communicate with FusionF1 to execute math function */
			MU_SendMsg(APP_MU, CHN_MU_REG_NUM, 3);
			dataTyped = 0;
			break;
		  case '4':
			/* Execute vector dot product */
			arm_mat_vec_dot_Test();
			/* Communicate with FusionF1 to execute math function */
			MU_SendMsg(APP_MU, CHN_MU_REG_NUM, 4);
			dataTyped = 0;
			break;
		  case '5':
			/* Execute inverse matrix */
			arm_mat_mtx_inv_Test();
			/* Communicate with FusionF1 to execute math function */
			MU_SendMsg(APP_MU, CHN_MU_REG_NUM, 5);
			dataTyped = 0;
			break;
          case '6':
            /* Execute matrix transpose */
            arm_mat_mtx_tnsp_Test();
            /* Communicate with FusionF1 to execute math function */
            MU_SendMsg(APP_MU, CHN_MU_REG_NUM, 6);
            dataTyped = 0;
            break;
          case '7':
            /* Change to graphic test */
            graphicTest = 1;
            dataTyped = 0;
            break;

          default:
        	  break;
        }
      uartTyped = false;
    }
    delay();
    /* Toggle led */
    LED_RED_TOGGLE();
  }

  BOARD_DSP_DeInit();
}

/*
 * Prepares the board LEDs: enables the clocks of high-speed GPIO ports 0, 1 and 3,
 * resets those ports, and initialises the red, blue and green LED outputs with
 * LOGIC_LED_OFF.
 */
void LED_INIT()
{
    /* Enable the port clocks */
    CLOCK_EnableClock(kCLOCK_HsGpio0);
    CLOCK_EnableClock(kCLOCK_HsGpio1);
    CLOCK_EnableClock(kCLOCK_HsGpio3);

    /* Reset the ports once they are clocked */
    RESET_PeripheralReset(kHSGPIO0_RST_SHIFT_RSTn);
    RESET_PeripheralReset(kHSGPIO1_RST_SHIFT_RSTn);
    RESET_PeripheralReset(kHSGPIO3_RST_SHIFT_RSTn);

    /* Init output LED GPIO. */
    LED_RED_INIT(LOGIC_LED_OFF);
    LED_BLUE_INIT(LOGIC_LED_OFF);
    LED_GREEN_INIT(LOGIC_LED_OFF);
}

/*
 * Configures the benchmark timer: attaches the main clock to CTIMER2, initialises
 * the timer with a prescale value of 1, and sets up match channel 0 to raise an
 * interrupt and reset the counter at a match value of 990, without stopping the
 * timer or driving an output pin. The timer is not started here.
 */
void CTIMER_INIT()
{
    /* Clock source and base timer configuration */
    CLOCK_AttachClk(kMAIN_CLK_to_CTIMER2);
    CTIMER_GetDefaultConfig(&ctimerConfig);
    ctimerConfig.prescale = 1;
    CTIMER_Init(CTIMER, &ctimerConfig);

    /* Match channel 0: periodic interrupt, free-running counter */
    matchConfig0.matchValue         = 990;
    matchConfig0.enableInterrupt    = true;
    matchConfig0.enableCounterReset = true;
    matchConfig0.enableCounterStop  = false;
    matchConfig0.outPinInitState    = false;
    matchConfig0.outControl         = kCTIMER_Output_NoAction;

    /* Register the callback table before the match channel is armed */
    CTIMER_RegisterCallBack(CTIMER, &ctimer_callback_table[0], kCTIMER_SingleCallback);
    CTIMER_SetupMatch(CTIMER, CTIMER_MAT0_OUT, &matchConfig0);
}

/* Clears every tiger path and then closes the VGLite library. */
static void cleanup(void)
{
    uint8_t idx = 0;

    while (idx < pathCount)
    {
        vg_lite_clear_path(&path[idx]);
        idx++;
    }

    vg_lite_close();
}

static vg_lite_error_t init_vg_lite(void)
{
    vg_lite_error_t error = VG_LITE_SUCCESS;
    int fb_width, fb_height;

    error = VGLITE_CreateDisplay(&display);
    if (error)
    {
        PRINTF("VGLITE_CreateDisplay failed: VGLITE_CreateDisplay() returned error %d\n", error);
        return error;
    }
    // Initialize the window.
    error = VGLITE_CreateWindow(&display, &window);
    if (error)
    {
        PRINTF("VGLITE_CreateWindow failed: VGLITE_CreateWindow() returned error %d\n", error);
        return error;
    }
    // Initialize the draw.
    error = vg_lite_init(TW, TH);
    if (error)
    {
        PRINTF("vg_lite engine init failed: vg_lite_init() returned error %d\n", error);
        cleanup();
        return error;
    }

    // Setup a scale at center of buffer.
    fb_width  = window.width;
    fb_height = window.height;
    vg_lite_identity(&matrix);
    vg_lite_translate(fb_width / 2 - 20 * fb_width / 640.0f, fb_height / 2 - 100 * fb_height / 480.0f, &matrix);
    vg_lite_scale(4, 4, &matrix);
    vg_lite_scale(fb_width / 640.0f, fb_height / 480.0f, &matrix);

    return error;
}

/*
 * Advances the animation by one step: scales the matrix by 0.8 for five steps,
 * then by 1.25 for five steps (switching direction via zoomOut), and rotates it
 * by 5 on every call.
 */
void animateTiger()
{
    if (!zoomOut)
    {
        vg_lite_scale(0.8, 0.8, &matrix);
        scaleCount++;
        if (scaleCount == 5)
        {
            zoomOut = 1;
        }
    }
    else
    {
        vg_lite_scale(1.25, 1.25, &matrix);
        scaleCount--;
        if (scaleCount == 0)
        {
            zoomOut = 0;
        }
    }

    vg_lite_rotate(5, &matrix);
}

/*
 * Renders one frame: clears the render target to 0xFFFFFFFF, draws every tiger
 * path with the current matrix, swaps buffers and advances the animation.
 *
 * Spins forever when no render target is available. When a draw call fails, the
 * error is printed, cleanup() is called and the frame is abandoned.
 */
static void redraw()
{
    vg_lite_buffer_t *target = VGLITE_GetRenderTarget(&window);

    if (target == NULL)
    {
        PRINTF("vg_lite_get_renderTarget error\r\n");
        for (;;)
        {
        }
    }

    /* Draw the paths using the matrix. */
    vg_lite_clear(target, NULL, 0xFFFFFFFF);

    uint8_t idx = 0;
    while (idx < pathCount)
    {
        vg_lite_error_t drawError =
            vg_lite_draw(target, &path[idx], VG_LITE_FILL_EVEN_ODD, &matrix, VG_LITE_BLEND_NONE, color_data[idx]);
        if (drawError)
        {
            PRINTF("vg_lite_draw() returned error %d\n", drawError);
            cleanup();
            return;
        }
        idx++;
    }

    VGLITE_SwapBuffers(&window);
    animateTiger();
}

/* Returns the FreeRTOS tick count multiplied by the tick period in milliseconds. */
uint32_t getTime()
{
    const uint32_t elapsedMs = (uint32_t)(xTaskGetTickCount() * portTICK_PERIOD_MS);

    return elapsedMs;
}

/*
 * FreeRTOS task that runs the graphics demo.
 *
 * Prepares the VGLite controller and the VGLite objects (spinning forever if
 * either step fails), then redraws frames in an endless loop and prints the
 * measured frame rate after every 60 frames.
 */
static void vglite_task(void *pvParameters)
{
    uint32_t frames = 0;
    uint32_t windowStart, elapsed, fps_x_1000;

    if (BOARD_PrepareVGLiteController() != kStatus_Success)
    {
        PRINTF("Prepare VGlite controller error\r\n");
        for (;;)
        {
        }
    }

    const vg_lite_error_t initError = init_vg_lite();
    if (initError)
    {
        PRINTF("init_vg_lite failed: init_vg_lite() returned error %d\n", initError);
        for (;;)
        {
        }
    }

    windowStart = getTime();
    for (;;)
    {
        redraw();
        frames++;
        if (frames < 60)
        {
            continue;
        }

        /* Report the frame rate (scaled by 1000) and start a new measurement window. */
        elapsed    = getTime() - windowStart;
        fps_x_1000 = (frames * 1000 * 1000) / elapsed;
        PRINTF("%d frames in %d mSec: %d.%d FPS\r\n", frames, elapsed, fps_x_1000 / 1000, fps_x_1000 % 1000);
        frames      = 0;
        windowStart = getTime();
    }
}

/*
 * Square root benchmark on the CM33.
 *
 * Runs arm_sqrt_q31() on 0.25 (Q31) LOOP_COUNT times while the CTIMER counts,
 * checks the result against sqrtRef with a tolerance of 2 LSB, and prints the
 * average cycle count per call.
 */
static void arm_mat_sqrt_Test()
{
    const q31_t operand = FLOAT_2_Q31(0.25f);
    uint32_t iter;
    uint32_t cyclesPerCall;

    PRINTF("----------------------------------\r\n");
    PRINTF("        SQRT FUNCTION\r\n\r\n");

    /* Restart the measurement: clear the tick counter, then start the timer. */
    countUseconds = 0;
    CTIMER_StartTimer(CTIMER);

    /* Timed section */
    iter = 0;
    while (iter < LOOP_COUNT)
    {
        arm_sqrt_q31(operand, &sqrtResult);
        iter++;
    }

    CTIMER_StopTimer(CTIMER);

    /* Verify the result */
    if (abs(sqrtRef - sqrtResult) > 2)
    {
        PRINTF("ERROR on SQRT\r\n");
    }

    /* Convert ctimer counts to cycles per call */
    cyclesPerCall = (uint32_t)((countUseconds*CYCLES_PER_COUNT)/LOOP_COUNT);

    PRINTF("CM33 SQRT takes %d cycles\r\n\r\n", cyclesPerCall);
}

/*
 * Sine benchmark on the CM33.
 *
 * Runs arm_sin_q31() LOOP_COUNT times while the CTIMER counts, checks the result
 * against sinRef, and prints the average cycle count per call. The input is
 * (0.5 / 6) in Q31, i.e. 1/12 of the input range (presumably 30 degrees when the
 * Q31 range maps to a full turn - see the CMSIS-DSP documentation).
 *
 * The verification is two-sided: a result that is too large is reported just like
 * a result that is too small.
 */
static void arm_mat_sine_Test()
{
  uint32_t i;
  uint32_t cycles;
  PRINTF("----------------------------------\r\n");
  PRINTF("         SINE FUNCTION\r\n\r\n");

  q31_t input      = FLOAT_2_Q31(0.5f / 6.0f);

  /* Reset count variable */
  countUseconds = 0;
  /* Start ctimer */
  CTIMER_StartTimer(CTIMER);

  /* Execute math function */
  for(i = 0; i < LOOP_COUNT; i++)
  {
    sinResult = arm_sin_q31(input);
  }

  /* Stop ctimer */
  CTIMER_StopTimer(CTIMER);

  /* Verify the result: the deviation must stay within +/-20000 LSB of the reference */
  const q31_t deviation = sinRef - sinResult;
  if((deviation > 20000) || (deviation < -20000))
  {
    PRINTF("ERROR on SINE\r\n");
  }

  /* Convert ctimer counts to cycles */
  cycles = (uint32_t)((countUseconds*CYCLES_PER_COUNT)/LOOP_COUNT);

  PRINTF("CM33 SINE takes %d cycles\r\n\r\n", cycles);
}

/*
 * Vector add benchmark on the CM33.
 *
 * Fills two vectors with 0..VEC_ADD_LENGTH-1, adds them LOOP_COUNT times as
 * 1 x VEC_ADD_LENGTH matrices with arm_mat_add_q31() while the CTIMER counts,
 * compares every element with the reference, and prints the average cycle count
 * per call.
 */
static void arm_mat_vec_add_Test()
{
    arm_matrix_instance_q31 lhs;
    arm_matrix_instance_q31 rhs;
    arm_matrix_instance_q31 sum;
    uint32_t idx;
    uint32_t cyclesPerCall;

    PRINTF("----------------------------------\r\n");
    PRINTF("       VECTOR ADD FUNCTION\r\n\r\n");

    /* Initialise operands and the expected result */
    for (idx = 0; idx < VEC_ADD_LENGTH; idx++)
    {
        vec_add_x[idx]       = idx;
        vec_add_y[idx]       = idx;
        vec_add_out_ref[idx] = idx + idx;
    }

    /* Wrap the arrays as single-row matrices */
    arm_mat_init_q31(&lhs, 1, VEC_ADD_LENGTH, vec_add_x);
    arm_mat_init_q31(&rhs, 1, VEC_ADD_LENGTH, vec_add_y);
    arm_mat_init_q31(&sum, 1, VEC_ADD_LENGTH, vec_add_out);

    /* Restart the measurement: clear the tick counter, then start the timer. */
    countUseconds = 0;
    CTIMER_StartTimer(CTIMER);

    /* Timed section */
    idx = 0;
    while (idx < LOOP_COUNT)
    {
        arm_mat_add_q31(&lhs, &rhs, &sum);
        idx++;
    }

    CTIMER_StopTimer(CTIMER);

    /* Verify the result; one message is printed per mismatching element */
    for (idx = 0; idx < VEC_ADD_LENGTH; idx++)
    {
        if (vec_add_out_ref[idx] != vec_add_out[idx])
        {
            PRINTF("ERROR on vector add\r\n");
        }
    }

    /* Convert ctimer counts to cycles per call */
    cyclesPerCall = (uint32_t)((countUseconds*CYCLES_PER_COUNT)/LOOP_COUNT);

    PRINTF("CM33 VECTOR ADD takes %d cycles\r\n\r\n", cyclesPerCall);
}

/*
 * Vector dot product benchmark on the CM33.
 *
 * Computes the dot product of vec_dot_a and vec_dot_b LOOP_COUNT times with
 * arm_dot_prod_f32() while the CTIMER counts, checks the result against the
 * reference value, and prints the average cycle count per call.
 *
 * The verification is two-sided: a result that is too small is reported just like
 * a result that is too large.
 */
static void arm_mat_vec_dot_Test()
{
  uint32_t i;
  uint32_t cycles;

  PRINTF("----------------------------------\r\n");
  PRINTF("    VECTOR DOT PRODUCT FUNCTION\r\n\r\n");

  float32_t vec_dot_out;
  float32_t vec_dot_out_ref = 138.733643;

  /* Reset count variable */
  countUseconds = 0;
  /* Start ctimer */
  CTIMER_StartTimer(CTIMER);

  /* Execute math function */
  for(i = 0; i < LOOP_COUNT; i++)
  {
    arm_dot_prod_f32(vec_dot_a, vec_dot_b, VEC_DOT_LENGTH, &vec_dot_out);
  }

  /* Stop ctimer */
  CTIMER_StopTimer(CTIMER);

  /* Verify the result: the deviation must stay within +/-0.001 of the reference */
  const float32_t deviation = vec_dot_out - vec_dot_out_ref;
  if((deviation > 0.001) || (deviation < -0.001))
  {
    PRINTF("ERROR on dot vector\r\n");
  }

  /* Convert ctimer counts to cycles */
  cycles = (uint32_t)((countUseconds*CYCLES_PER_COUNT)/LOOP_COUNT);

  PRINTF("CM33 VECTOR DOT PRODUCT takes %d cycles\r\n\r\n", cycles);
}

static void arm_mat_mtx_inv_Test()
{
  uint32_t i;
  uint32_t cycles;
  PRINTF("----------------------------------\r\n");
  PRINTF("     INVERSE MATRIX FUNCTION\r\n\r\n");

  arm_matrix_instance_f32 inverseMatrix;
  arm_matrix_instance_f32 inverseMatrixR;

  /* inv(I) = I */
  for (i = 0; i < 2; i++)
  {
      M1[i * 2 + i]         = 1.0f;
      inverseRef[i * 2 + i] = 1.0f;
  }

  /* Initialise Matrix Instance inverseMatrix with numRows, numCols and data array(M1) */
  arm_mat_init_f32(&inverseMatrix, 2, 2, M1);
  /* Initialise Matrix Instance inverseMatrixR with numRows, numCols and data array(inverseResult) */
  arm_mat_init_f32(&inverseMatrixR, 2, 2, inverseResult);

  /* Reset count variable */
  countUseconds = 0;
  /* Start ctimer */
  CTIMER_StartTimer(CTIMER);

  /* Execute math function */
  for(i = 0; i < LOOP_COUNT; i++)
  {
    arm_mat_inverse_f32(&inverseMatrix,&inverseMatrixR);
  }

  /* Stop ctimer */
  CTIMER_StopTimer(CTIMER);

  /* Verify the result */
  for(i = 0; i < 4; i++)
  {
      if(inverseResult[i] != inverseRef[i])
      {
          PRINTF("ERROR on inverse matrix\r\n");
      }
  }

  /* Convert ctimer counts to cycles */
  cycles = (uint32_t)((countUseconds*CYCLES_PER_COUNT)/LOOP_COUNT);

  PRINTF("CM33 INVERSE MATRIX takes %d cycles\r\n\r\n", cycles);
}

static void arm_mat_mtx_tnsp_Test()
{
  uint32_t i;
  uint32_t cycles;

  PRINTF("----------------------------------\r\n");
  PRINTF("     TANSPOSE MATRIX FUNCTION\r\n\r\n");

  arm_matrix_instance_f32 transMatrix;
  arm_matrix_instance_f32 transMatrixR;

  /* Initialise matrix values */
  for (i = 0; i < MTX_TRANS_LENGHT; i++)
  {
      transpose[i]                 = 1.0f;
      transposeRel[MTX_TRANS_LENGHT * i] = 1.0f;
  }

  /* Initialise Matrix Instance transMatrix with numRows, numCols and data array(transpose) */
  arm_mat_init_f32(&transMatrix, MTX_TRANS_LENGHT, MTX_TRANS_LENGHT, transpose);
  /* Initialise Matrix Instance transMatrixR with numRows, numCols and data array(transposeResult) */
  arm_mat_init_f32(&transMatrixR, MTX_TRANS_LENGHT, MTX_TRANS_LENGHT, transposeResult);

  /* Reset count variable */
  countUseconds = 0;
  /* Start ctimer */
  CTIMER_StartTimer(CTIMER);

  /* Execute math function */
  for(i = 0; i < LOOP_COUNT; i++)
  {
    arm_mat_trans_f32(&transMatrix, &transMatrixR);
  }

  /* Stop ctimer */
  CTIMER_StopTimer(CTIMER);

  /* Verify the result */
  for(i = 0; i < MTX_TRANS_LENGHT*MTX_TRANS_LENGHT; i++)
  {
      if(transMatrixR.pData[i] != transposeRel[i])
      {
          PRINTF("ERROR on transpose matrix\r\n");
      }
  }

  /* Convert ctimer counts to cycles */
  cycles = (uint32_t)((countUseconds*CYCLES_PER_COUNT)/LOOP_COUNT);

  PRINTF("CM33 TANSPOSE MATRIX takes %d cycles\r\n\r\n", cycles);
}

/*
 * Initialises a float32 matrix instance.
 *
 * S        - matrix instance to fill in
 * nRows    - number of rows
 * nColumns - number of columns
 * pData    - storage holding the matrix elements (stored by pointer, not copied)
 */
void arm_mat_init_f32(arm_matrix_instance_f32 *S, uint16_t nRows, uint16_t nColumns, float32_t *pData)
{
    /* Element storage first, then the dimensions */
    S->pData   = pData;
    S->numCols = nColumns;
    S->numRows = nRows;
}

/*
 * Initialises a Q31 matrix instance.
 *
 * S        - matrix instance to fill in
 * nRows    - number of rows
 * nColumns - number of columns
 * pData    - storage holding the matrix elements (stored by pointer, not copied)
 */
void arm_mat_init_q31(arm_matrix_instance_q31 *S, uint16_t nRows, uint16_t nColumns, q31_t *pData)
{
    /* Element storage first, then the dimensions */
    S->pData   = pData;
    S->numCols = nColumns;
    S->numRows = nRows;
}
