#/***************************************************************************************/
#/* FILE NAME: fir_Signed16_10tap.s       COPYRIGHT (c) Freescale 2015   */
#/*                                                      All Rights Reserved  						 */
#/* DESCRIPTION: 10 Tap FIR filter in LSP assembly    					             */
#/*                                                                                             				 */
#/***************************************************************************************/	
#/* REV      AUTHOR        DATE        DESCRIPTION OF CHANGE 			 */
#/* ---   -----------    ----------    ---------------------                                     		     */
#/* 1.0	  A Turner   			April 2015   Initial Public Release        		     */
#/***************************************************************************************/
.globl fir_Signed16
.section .vle_imem  , axv
.vle


# N - number of output samples (produced two per loop pass, so N should be even)
# x - input array of size N+ntaps-1
# y - output array of size N
# h - array of ntaps (10) coefficients

# C prototype:
#   void fir_Signed16(unsigned short N, short *x, short *y, short *h);
# Example call:
#   fir_Signed16(N, SDADC1_RESULTS, LSP_FIR_OUT, hr1);
#------------------------------------------------------------------------------
# void fir_Signed16(unsigned short N, short *x, short *y, short *h);
#
# 10-tap FIR filter on signed 16-bit data using LSP vector instructions.
# Two output samples are produced per loop pass.
#
# ABI:   PowerPC EABI, VLE encoding, leaf routine.
# In:    r3 = N   number of output samples. Outputs are produced in pairs, so
#                 an odd N is rounded down to the next even value and
#                 N < 2 returns without touching x, y or the stack.
#        r4 = x   input samples  (N + 10 - 1 halfwords)
#        r5 = y   output samples (N halfwords)
#        r6 = h   10 coefficients (halfwords)
# Out:   y[0 .. N-1] written. No return value.
# Clobb: r3-r12 (volatile), CR0. r14-r31 are saved and restored. r13 (the EABI
#        small-data base pointer) is not touched.
# Stack: 80-byte frame
#          0(r1)  back chain (written by e_stwu)
#          4(r1)  LR save word required by the frame layout (unused here)
#          8(r1)  r14..r31 (18 words)
#
# NOTE(review): input is fetched one word (two samples) at a time, so the last
#   pass appears to read one halfword past x[N+8]; that sample does not reach
#   any stored output. Confirm the buffer tolerates the extra read.
#------------------------------------------------------------------------------
.align 16                               # NOTE(review): GNU as for PowerPC treats this operand as a power of two (2^16 bytes) - confirm that is intended
fir_Signed16:

#------------------------------ register aliases ------------------------------
# arguments
.equ N, r3
.equ x, r4
.equ y, r5
.equ h, r6

# coefficients: each register holds one coefficient replicated in both halfwords
.equ h0, r10
.equ h1, r11
.equ h2, r12
.equ h3, r31                            # was r13: r13 is the EABI small-data base pointer and must not be modified
.equ h4, r14
.equ h5, r15
.equ h6, r16
.equ h7, r17
.equ h8, r18
.equ h9, r19

# data: each register holds two consecutive 16-bit input samples
.equ x0, r20
.equ x1, r21
.equ x2, r22
.equ x3, r23
.equ x4, r24
.equ x5, r25
.equ x6, r26                            # x6..x10 are not referenced by this routine
.equ x7, r27
.equ x8, r28
.equ x9, r29
.equ x10,r30

# accumulator and loop counter
.equ temp, r8
.equ temp1,r9                           # never named in an instruction; presumably the odd half of the temp:temp1 pair
                                        # written by the zvmhulsf* instructions - TODO confirm against the LSP manual
.equ cnt, r7

# stack frame layout
.equ FIR16_FRAME_SIZE, 80               # 8 bytes of frame header + 18 saved words, keeps r1 16-byte aligned
.equ FIR16_GPR_SAVE,   8                # r14..r31 are saved above the back chain and LR save word

#------------------------------ argument check --------------------------------
# The loop emits two samples per pass. Round N down to an even count so the
# loop never stores past y[N-1], and leave early when nothing is left to do.
    e_rlwinm    N, N, 0, 0, 30          # N &= ~1
    se_cmpi     N, 0
    e_beq       fir_Signed16_exit       # no frame has been created yet, so return directly

#------------------------------ save context ----------------------------------
    e_stwu      r1, -FIR16_FRAME_SIZE(r1)   # allocate the frame; old r1 is stored at 0(r1) as the back chain
    e_stmw      r14, FIR16_GPR_SAVE(r1)     # save r14..r31: they are nonvolatile and r14-r25 and r31 are overwritten below

    e_li        cnt, 0                  # output samples produced so far

#------------------------------ STAGE 1: load the coefficients ----------------
# zlhhsplat[u] loads one halfword and replicates it into both halves of rD.
# The update form also writes the effective address (h + 2) back to h, so the
# nine update loads step through h[1] .. h[9].
    zlhhsplat   h0, 0(h)                # coefficient 1
    zlhhsplatu  h1, 2(h)                # coefficient 2
    zlhhsplatu  h2, 2(h)                # coefficient 3
    zlhhsplatu  h3, 2(h)                # coefficient 4
    zlhhsplatu  h4, 2(h)                # coefficient 5
    zlhhsplatu  h5, 2(h)                # coefficient 6
    zlhhsplatu  h6, 2(h)                # coefficient 7
    zlhhsplatu  h7, 2(h)                # coefficient 8
    zlhhsplatu  h8, 2(h)                # coefficient 9
    zlhhsplatu  h9, 2(h)                # coefficient 10

#------------------------------ load the initial data -------------------------
    se_subi     y, 4                    # zstwhedu in the loop pre-increments y by 4, so start one word early

    zldd        x0, 0(x)                # x0,x1 = first four samples
    zlddu       x2, 8(x)                # x2,x3 = next four samples; x advances by 8
    zlwwu       x4, 8(x)                # x4    = next two samples;  x advances by 8

#------------------------------ MAIN LOOP -------------------------------------
loop:
    zlwwu       x5, 4(x)                # fetch the next two samples; merged into the window in stages 3 and 6

# STAGE 2: multiply-accumulate with coefficients 10, 8, 6, 4, 2
    zvmhulsf    temp, h9, x0            # first product initialises the accumulator
    zvmhulsfaas temp, h7, x1
    zvmhulsfaas temp, h5, x2
    zvmhulsfaas temp, h3, x3
    zvmhulsfaas temp, h1, x4

# STAGE 3: slide the data window by one sample so the remaining coefficients
#          line up with their delayed samples
    zvmergelohih x0, x0, x1
    zvmergelohih x1, x1, x2
    zvmergelohih x2, x2, x3
    zvmergelohih x3, x3, x4
    zvmergelohih x4, x4, x5             # first new sample enters the window here

# STAGE 4: multiply-accumulate with coefficients 9, 7, 5, 3, 1
    zvmhulsfaas temp, h8, x0
    zvmhulsfaas temp, h6, x1
    zvmhulsfaas temp, h4, x2
    zvmhulsfaas temp, h2, x3
    zvmhulsfaas temp, h0, x4

# STAGE 5: store the two 16-bit results
    zstwhedu    temp, 4(y)              # y += 4, then store both output samples

# STAGE 6: slide the window by one more sample, ready for the next pass
    zvmergelohih x0, x0, x1
    zvmergelohih x1, x1, x2
    zvmergelohih x2, x2, x3
    zvmergelohih x3, x3, x4
    zvmergehiloh x4, x5, x5             # x4 = x5: second new sample enters the window here

# STAGE 7: loop end test
    e_addi      cnt, cnt, 2             # two samples were written this pass
    cmpw        cnt, N
    e_blt       loop                    # "less than" rather than "not equal" so the loop cannot run past N

#------------------------------ STAGE 8: restore context ----------------------
    e_lmw       r14, FIR16_GPR_SAVE(r1) # restore r14..r31
    e_addi      r1, r1, FIR16_FRAME_SIZE    # release the stack frame

fir_Signed16_exit:
    se_blr
