The following program which is used for matrix Multiplication 4 by 4 , when I don't use the optimization (-o2) ,the result is right, when I use optimization (-o2) , the result is wrong. The program is from the Optimizing Loops on the C66x DSP, I have displace the _cmatmpyr1 with _cmatmpy and shifting to meet our need. All the input and output is ten quantitative(Q10) signed complex 16-bit (16-bit real/16-bit Imaginary).
#include "c6x.h"
#define MATRIXSIZE 4
void MATRIX_Mult4by4(int M, int *restrict input_mat1, int *restrict input_mat2, int *restrict output_mat)
{
_nassert((int) input_mat1 % 8 == 0);
_nassert((int) input_mat2 % 8 == 0);
_nassert((int) output_mat % 8 == 0);
long long * input_vec1;
long long * input_vec2;
long long * input_vec3;
long long * input_vec4;
long long * inputMatPtr1;
long long * inputMatPtr2;
long long * inputMatPtr3;
long long * inputMatPtr4;
long long llinputV1;
long long llinputV2;
long long llinputV3;
long long llinputV4;
long long llinputM1;
long long llinputM2;
__x128_t inputMat;
__x128_t acc1_128;
__x128_t acc2_128;
__x128_t acc3_128;
__x128_t acc4_128;
__x128_t acc5_128;
__x128_t acc6_128;
__x128_t acc7_128;
__x128_t acc8_128;
int* acc1_32;
int* acc2_32;
int* acc3_32;
int* acc4_32;
int* acc5_32;
int* acc6_32;
int* acc7_32;
int* acc8_32;
long long acc1;
long long acc2;
long long acc3;
long long acc4;
long long acc5;
long long acc6;
long long acc7;
long long acc8;
int mm;
#pragma MUST_ITERATE(NUMOFPOINT,NUMOFPOINT,2)
for ( mm = 0; mm < M; mm++ ) //M 4x4 matrix multiplications in the loop
{
input_vec1 =(long long *) &input_mat1[mm * MATRIXSIZE * MATRIXSIZE + 0 * MATRIXSIZE];
input_vec2 =(long long *) &input_mat1[mm * MATRIXSIZE * MATRIXSIZE + 1 * MATRIXSIZE];
input_vec3 =(long long *) &input_mat1[mm * MATRIXSIZE * MATRIXSIZE + 2 * MATRIXSIZE];
input_vec4 =(long long *) &input_mat1[mm * MATRIXSIZE * MATRIXSIZE + 3 * MATRIXSIZE];
inputMatPtr1=(long long*) &input_mat2[mm * MATRIXSIZE * MATRIXSIZE + 0 * MATRIXSIZE];
inputMatPtr2=(long long*) &input_mat2[mm * MATRIXSIZE * MATRIXSIZE + 1 * MATRIXSIZE];
inputMatPtr3=(long long*) &input_mat2[mm * MATRIXSIZE * MATRIXSIZE + 2 * MATRIXSIZE];
inputMatPtr4=(long long*) &input_mat2[mm * MATRIXSIZE * MATRIXSIZE + 3 * MATRIXSIZE];
/* (4x4) X (4x4) */
llinputV1 =_amem8(input_vec1++ );
llinputV2 =_amem8(input_vec2++ );
llinputV3 =_amem8(input_vec3++ );
llinputV4 =_amem8(input_vec4++ );
llinputM1 =_amem8(inputMatPtr1++ );
llinputM2 =_amem8(inputMatPtr2++ );
// #ifdef _LITTLE_ENDIAN
inputMat = _llto128(llinputM2, llinputM1);
// #else
// inputMat = _llto128(llinputM1, llinputM2);
// #endif
acc1_128 = _cmatmpy(llinputV1, inputMat);
acc2_128 = _cmatmpy(llinputV2, inputMat);
acc3_128 = _cmatmpy(llinputV3, inputMat);
acc4_128 = _cmatmpy(llinputV4, inputMat);
acc1_32 = (int *)&acc1_128;
acc2_32 = (int *)&acc2_128;
acc3_32 = (int *)&acc3_128;
acc4_32 = (int *)&acc4_128;
acc1 = _itoll(_pack2(_amem4(acc1_32++)>>10,_amem4(acc1_32++)>>10),_pack2(_amem4(acc1_32++)>>10,_amem4(acc1_32++)>>10));
acc2 = _itoll(_pack2(_amem4(acc2_32++)>>10,_amem4(acc2_32++)>>10),_pack2(_amem4(acc2_32++)>>10,_amem4(acc2_32++)>>10));
acc3 = _itoll(_pack2(_amem4(acc3_32++)>>10,_amem4(acc3_32++)>>10),_pack2(_amem4(acc3_32++)>>10,_amem4(acc3_32++)>>10));
acc4 = _itoll(_pack2(_amem4(acc4_32++)>>10,_amem4(acc4_32++)>>10),_pack2(_amem4(acc4_32++)>>10,_amem4(acc4_32++)>>10));
llinputM1 = _amem8(inputMatPtr1++ );
llinputM2 = _amem8(inputMatPtr2++ );
// #ifdef _LITTLE_ENDIAN
inputMat = _llto128(llinputM2, llinputM1);
// #else
// inputMat = _llto128(llinputM1, llinputM2);
// #endif
acc5_128 = _cmatmpy(llinputV1, inputMat);
acc6_128 = _cmatmpy(llinputV2, inputMat);
acc7_128 = _cmatmpy(llinputV3, inputMat);
acc8_128 = _cmatmpy(llinputV4, inputMat);
acc5_32 = (int *)&acc5_128;
acc6_32 = (int *)&acc6_128;
acc7_32 = (int *)&acc7_128;
acc8_32 = (int *)&acc8_128;
acc5 = _itoll(_pack2(_amem4(acc5_32++)>>10,_amem4(acc5_32++)>>10),_pack2(_amem4(acc5_32++)>>10,_amem4(acc5_32++)>>10));
acc6 = _itoll(_pack2(_amem4(acc6_32++)>>10,_amem4(acc6_32++)>>10),_pack2(_amem4(acc6_32++)>>10,_amem4(acc6_32++)>>10));
acc7 = _itoll(_pack2(_amem4(acc7_32++)>>10,_amem4(acc7_32++)>>10),_pack2(_amem4(acc7_32++)>>10,_amem4(acc7_32++)>>10));
acc8 = _itoll(_pack2(_amem4(acc8_32++)>>10,_amem4(acc8_32++)>>10),_pack2(_amem4(acc8_32++)>>10,_amem4(acc8_32++)>>10));
llinputV1 =_amem8(input_vec1++ );
llinputV2 =_amem8(input_vec2++ );
llinputV3 =_amem8(input_vec3++ );
llinputV4 =_amem8(input_vec4++ );
llinputM1=_amem8(inputMatPtr3++ );
llinputM2=_amem8(inputMatPtr4++ );
// #ifdef _LITTLE_ENDIAN
inputMat = _llto128(llinputM2, llinputM1);
// #else
// inputMat = _llto128(llinputM1, llinputM2);
// #endif
// acc1 = _dadd2(_cmatmpyr1(llinputV1, inputMat), acc1);
// acc2 = _dadd2(_cmatmpyr1(llinputV2, inputMat), acc2);
// acc3 = _dadd2(_cmatmpyr1(llinputV3, inputMat), acc3);
// acc4 = _dadd2(_cmatmpyr1(llinputV4, inputMat), acc4);
acc1_128 = _cmatmpy(llinputV1, inputMat);
acc2_128 = _cmatmpy(llinputV2, inputMat);
acc3_128 = _cmatmpy(llinputV3, inputMat);
acc4_128 = _cmatmpy(llinputV4, inputMat);
acc1_32 = (int *)&acc1_128;
acc2_32 = (int *)&acc2_128;
acc3_32 = (int *)&acc3_128;
acc4_32 = (int *)&acc4_128;
acc1 = _dadd2( _itoll(_pack2(_amem4(acc1_32++)>>10,_amem4(acc1_32++)>>10),_pack2(_amem4(acc1_32++)>>10,_amem4(acc1_32++)>>10)),acc1);
acc2 = _dadd2( _itoll(_pack2(_amem4(acc2_32++)>>10,_amem4(acc2_32++)>>10),_pack2(_amem4(acc2_32++)>>10,_amem4(acc2_32++)>>10)),acc2);
acc3 = _dadd2( _itoll(_pack2(_amem4(acc3_32++)>>10,_amem4(acc3_32++)>>10),_pack2(_amem4(acc3_32++)>>10,_amem4(acc3_32++)>>10)),acc3);
acc4 = _dadd2( _itoll(_pack2(_amem4(acc4_32++)>>10,_amem4(acc4_32++)>>10),_pack2(_amem4(acc4_32++)>>10,_amem4(acc4_32++)>>10)),acc4);
llinputM1=_amem8(inputMatPtr3++ );
llinputM2=_amem8(inputMatPtr4++ );
// #ifdef _LITTLE_ENDIAN
inputMat = _llto128(llinputM2, llinputM1);
// #else
// inputMat = _llto128(llinputM1, llinputM2);
// #endif
// acc5 = _dadd2(_cmatmpyr1(llinputV1, inputMat), acc5);
// acc6 = _dadd2(_cmatmpyr1(llinputV2, inputMat), acc6);
// acc7 = _dadd2(_cmatmpyr1(llinputV3, inputMat), acc7);
// acc8 = _dadd2(_cmatmpyr1(llinputV4, inputMat), acc8);
acc5_128 = _cmatmpy(llinputV1, inputMat);
acc6_128 = _cmatmpy(llinputV2, inputMat);
acc7_128 = _cmatmpy(llinputV3, inputMat);
acc8_128 = _cmatmpy(llinputV4, inputMat);
acc5_32 = (int *)&acc5_128;
acc6_32 = (int *)&acc6_128;
acc7_32 = (int *)&acc7_128;
acc8_32 = (int *)&acc8_128;
acc5 = _dadd2( _itoll(_pack2(_amem4(acc5_32++)>>10,_amem4(acc5_32++)>>10),_pack2(_amem4(acc5_32++)>>10,_amem4(acc5_32++)>>10)),acc5);
acc6 = _dadd2( _itoll(_pack2(_amem4(acc6_32++)>>10,_amem4(acc6_32++)>>10),_pack2(_amem4(acc6_32++)>>10,_amem4(acc6_32++)>>10)),acc6);
acc7 = _dadd2( _itoll(_pack2(_amem4(acc7_32++)>>10,_amem4(acc7_32++)>>10),_pack2(_amem4(acc7_32++)>>10,_amem4(acc7_32++)>>10)),acc7);
acc8 = _dadd2( _itoll(_pack2(_amem4(acc8_32++)>>10,_amem4(acc8_32++)>>10),_pack2(_amem4(acc8_32++)>>10,_amem4(acc8_32++)>>10)),acc8);
_amem8(&output_mat[mm * MATRIXSIZE * MATRIXSIZE + 0 * MATRIXSIZE + 0]) = acc1;
_amem8(&output_mat[mm * MATRIXSIZE * MATRIXSIZE + 0 * MATRIXSIZE + 2]) = acc5;
_amem8(&output_mat[mm * MATRIXSIZE * MATRIXSIZE + 1 * MATRIXSIZE + 0]) = acc2;
_amem8(&output_mat[mm * MATRIXSIZE * MATRIXSIZE + 1 * MATRIXSIZE + 2]) = acc6;
_amem8(&output_mat[mm * MATRIXSIZE * MATRIXSIZE + 2 * MATRIXSIZE + 0]) = acc3;
_amem8(&output_mat[mm * MATRIXSIZE * MATRIXSIZE + 2 * MATRIXSIZE + 2]) = acc7;
_amem8(&output_mat[mm * MATRIXSIZE * MATRIXSIZE + 3 * MATRIXSIZE + 0]) = acc4;
_amem8(&output_mat[mm * MATRIXSIZE * MATRIXSIZE + 3 * MATRIXSIZE + 2]) = acc8;
}
}