ARM NEON 优化举例
计算数组中从起始元素开始的指定个数元素的累加和:
/*
 * Sum the first `nitems` 32-bit unsigned elements starting at `ptr`.
 *
 * Whole groups of four elements are accumulated lane-wise in a 128-bit NEON
 * register; the 0-3 elements left over when `nitems` is not a multiple of 4
 * are added with a scalar loop, so the function never reads beyond
 * ptr[nitems - 1].
 *
 * Parameters:
 *   ptr    - first element to sum (only read, never written).
 *   nitems - number of elements to sum; 0 yields 0.
 *
 * Returns the sum; the additions are unsigned 32-bit, so the result wraps
 * modulo 2^32.
 */
uint32_t vector_add_of_n(uint32_t *ptr, uint32_t nitems)
{
    const uint32_t *cur = ptr;
    /* End of the part that can be consumed four lanes at a time. */
    const uint32_t *vec_end = ptr + (nitems & ~(uint32_t)3);
    const uint32_t *end = ptr + nitems;
    uint32x4_t acc = vdupq_n_u32(0);
    uint32x2_t halves;
    uint32_t result;

    /* Four running partial sums, one per lane. */
    for (; cur < vec_end; cur += 4) {
        acc = vaddq_u32(acc, vld1q_u32(cur));
    }

    /* Fold 4 lanes -> 2 lanes -> scalar. */
    halves = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
    result = vget_lane_u32(halves, 0) + vget_lane_u32(halves, 1);

    /* Scalar tail: a full 4-lane load here would read past the input. */
    for (; cur < end; cur++) {
        result += *cur;
    }

    return result;
}
两个数组对应元素的乘积之和:
void fir(short * y,const short *x, const short *h,int n_out, int n_coefs)
{
int n, k;
int sum;
int16x4_t h_vec;
int16x4_t x_vec;
int32x4_t result_vec;
for (n = 0; n < n_out; n++)
{
sum = 0;
result_vec = vdupq_n_s32(0);
for(k = 0; k < n_coefs / 4; k++)
{
h_vec = vld1_s16(&h[k*4]);
x_vec = vld1_s16(&x[n - n_coefs + 1 + k*4]);
result_vec = vmlal_s16(result_vec, h_vec, x_vec);
}
sum += vgetq_lane_s32(result_vec, 0);
sum += vgetq_lane_s32(result_vec, 1);
sum += vgetq_lane_s32(result_vec, 2);
sum += vgetq_lane_s32(result_vec, 3);
if(n_coefs % 4)
{
for(k = n_coefs - (n_coefs % 4); k < n_coefs; k++)
sum += h[k] * x[n - n_coefs + 1 + k];
}
y[n] = ((sum>>15) + 1) >> 1;
} |