使用SIMD的move来实现资源平衡
C66x的SIMD move指令能在寄存器间转移数据。需要注意的是,_itoll、_ftod、_ito128、_fto128、_dto128 和 _llto128 等intrinsic在C66x上对应的是非SIMD的move指令,使用这些intrinsic会使循环受到 .L、.S 和 .D 单元的资源约束。
因而可以考虑使用SIMD的move指令来代替这些intrinsic,例如SIMD intrinsic _dmv(整型)和 _fdmv(双精度)。对于何时使用SIMD的move指令,有如下参考建议:
如果需要把寄存器的值赋给寄存器对,可以使用SIMD move;
如果确定这些寄存器不会在接下来的指令中被使用,可以使用SIMD move。
需要注意的是,SIMD的move会增加循环的动态长度。
尽可能避免公共子表达式(common subexpression),尤其对于__x128_t类型
对于TMS320C66x编译器而言,结果为__x128_t类型的表达式不会被识别为公共子表达式,因而可能会被重复计算。所以在使用返回__x128_t数据类型的intrinsic时,应尽量把相同的部分提取到临时变量中,如下所示。这种修改不会改变循环的功能,但是却能改变性能。
/*
 * dprod_vcse - example kernel in which each _cmpysp() product is written
 * twice per statement: once as the operand of _hid128() and once as the
 * operand of _lod128().
 *
 * For every outer index i, the inner loop runs nCoefs times and accumulates
 * the sum of the high and low 64-bit halves of the 128-bit _cmpysp() result
 * into four running sums, which are then stored to sumPtr[i..i+3].
 *
 * inputPtr  - input values, read at indices i .. i+3
 * coefsPtr  - coefficient values, read at index i
 * nCoefs    - trip count of the inner accumulation loop
 * sumPtr    - output, written at indices i .. i+3
 * nlength   - the outer loop runs nlength/4 times
 *
 * Requires the TI C6000 compiler intrinsics (_cmpysp, _daddsp, _hid128,
 * _lod128); presumably each double carries a packed pair of single-precision
 * values -- TODO confirm against the intrinsic reference.
 *
 * NOTE(review): the inner loop body does not reference j, the accumulators
 * are never reset between outer iterations, and consecutive values of i write
 * overlapping sumPtr elements. This is kept exactly as in the example, whose
 * purpose is to illustrate the repeated-subexpression cost, not to be a
 * production kernel.
 */
void dprod_vcse(double *restrict inputPtr, double *restrict coefsPtr,
                int nCoefs, double *restrict sumPtr, int nlength)
{
    int i, j;
    double sumTemp = 0, sumTemp1 = 0, sumTemp2 = 0, sumTemp3 = 0;

    for (i = 0; i < nlength / 4; i++) {
        for (j = 0; j < nCoefs; j++) {
            /* Each statement spells out the same _cmpysp() call twice. */
            sumTemp = _daddsp(sumTemp,
                              _daddsp(_hid128(_cmpysp(inputPtr[i], coefsPtr[i])),
                                      _lod128(_cmpysp(inputPtr[i], coefsPtr[i]))));
            sumTemp1 = _daddsp(sumTemp1,
                               _daddsp(_hid128(_cmpysp(inputPtr[i+1], coefsPtr[i])),
                                       _lod128(_cmpysp(inputPtr[i+1], coefsPtr[i]))));
            sumTemp2 = _daddsp(sumTemp2,
                               _daddsp(_hid128(_cmpysp(inputPtr[i+2], coefsPtr[i])),
                                       _lod128(_cmpysp(inputPtr[i+2], coefsPtr[i]))));
            sumTemp3 = _daddsp(sumTemp3,
                               _daddsp(_hid128(_cmpysp(inputPtr[i+3], coefsPtr[i])),
                                       _lod128(_cmpysp(inputPtr[i+3], coefsPtr[i]))));
        }
        sumPtr[i] = sumTemp;
        sumPtr[i+1] = sumTemp1;
        sumPtr[i+2] = sumTemp2;
        sumPtr[i+3] = sumTemp3;
    }
}
修改为
<span style="font-style: normal;">void dprod_novcse(double *restrict inputPtr,double *restrict coefsPtr,int
nCoefs,double *restrict sumPtr, int nlength) {
int i, j;
double sumTemp = 0, sumTemp1 = 0, sumTemp2 = 0, sumTemp3 = 0;
__x128_t cmpysp_temp, cmpysp_temp1, cmpysp_temp2, cmpysp_temp3;
for(i = 0; i<nlength/4; i++)
{
for (j = 0; j < nCoefs; j++)
{
cmpysp_temp = _cmpysp(inputPtr</span><span style="font-style: normal;">,coefsPtr</span><span style="font-style: normal;">);
sumTemp = _daddsp(sumTemp, _daddsp(_hid128(cmpysp_temp),
_lod128(cmpysp_temp)));
cmpysp_temp1 = _cmpysp(inputPtr[i+1],coefsPtr</span><span style="font-style: normal;">);
sumTemp1 = _daddsp(sumTemp1, _daddsp(_hid128(cmpysp_temp1),
_lod128(cmpysp_temp1)));
cmpysp_temp2 = _cmpysp(inputPtr[i+2],coefsPtr</span><span style="font-style: normal;">);
sumTemp2 = _daddsp(sumTemp2, _daddsp(_hid128(cmpysp_temp2),
_lod128(cmpysp_temp2)));
cmpysp_temp3 = _cmpysp(inputPtr[i+3],coefsPtr</span><span style="font-style: normal;">);
sumTemp3 = _daddsp(sumTemp3, _daddsp(_hid128(cmpysp_temp3),
_lod128(cmpysp_temp3)));
}
sumPtr</span><span style="font-style: normal;"> = sumTemp;
sumPtr[i+1] = sumTemp1;
sumPtr[i+2] = sumTemp2;</span>
sumPtr[i+3] = sumTemp3;
}
} |