打印

转:ARM NEON 优化举例

[复制链接]
3042|1
手机看帖
扫描二维码
随时随地手机跟帖
跳转到指定楼层
楼主
粉色壁纸|  楼主 | 2012-1-28 18:35 | 只看该作者 回帖奖励 |倒序浏览 |阅读模式
ARM, neo, ge, ui, AN
ARM NEON 优化举例

累加数组内起始元素到制定个数元素的累加和:
/**
 * Sum the first nitems elements of a uint32_t array using NEON.
 *
 * Elements are accumulated four at a time in a 128-bit vector. The up to
 * three elements left over when nitems is not a multiple of 4 are added with
 * scalar code, so no load ever reaches past ptr[nitems - 1].
 *
 * @param ptr    first element to sum
 * @param nitems number of elements to sum (0 yields 0)
 * @return the sum of the elements, wrapping modulo 2^32
 */
uint32_t vector_add_of_n(uint32_t *ptr, uint32_t nitems)
{
    /* Largest multiple of 4 that does not exceed nitems. */
    uint32_t vec_items = nitems & ~(uint32_t)3;
    uint32_t idx;
    uint32_t result;
    uint32x2_t vec64a, vec64b;
    uint32x4_t vec128 = vdupq_n_u32(0);

    for (idx = 0; idx < vec_items; idx += 4)
    {
        uint32x4_t temp128 = vld1q_u32(ptr + idx);
        vec128 = vaddq_u32(vec128, temp128);
    }

    /* Fold the four 32-bit lanes down to a single scalar. */
    vec64a = vget_low_u32(vec128);
    vec64b = vget_high_u32(vec128);
    vec64a = vadd_u32(vec64a, vec64b);
    result =  vget_lane_u32(vec64a, 0);
    result += vget_lane_u32(vec64a, 1);

    /* Scalar tail: the remaining nitems % 4 elements. */
    for (; idx < nitems; idx++)
    {
        result += ptr[idx];
    }

    return result;
}



两个数组相关映射元素的乘积和:
/**
 * FIR filter on 16-bit samples, using NEON for the multiply-accumulate.
 *
 * For every n in [0, n_out) this computes
 *     sum = h[0]*x[n - n_coefs + 1] + ... + h[n_coefs - 1]*x[n]
 * in 32-bit arithmetic and stores ((sum >> 15) + 1) >> 1 in y[n].
 *
 * x is indexed from n - n_coefs + 1, so for n < n_coefs - 1 the function reads
 * elements located before x[0]: the caller must pass a pointer that has at
 * least n_coefs - 1 valid samples in front of it.
 *
 * @param y       output buffer, n_out elements are written
 * @param x       input samples (see the note on history samples above)
 * @param h       filter coefficients, n_coefs elements are read
 * @param n_out   number of output samples to produce
 * @param n_coefs number of filter coefficients
 */
void  fir(short * y,const short *x, const short *h,int n_out, int n_coefs)
{
    int n, k;
    int sum;
    int16x4_t h_vec;
    int16x4_t x_vec;
    int32x4_t result_vec;

    for (n = 0; n < n_out; n++)
    {
        /* Vector part: four taps per iteration, widened into 32-bit lanes. */
        result_vec = vdupq_n_s32(0);
        for (k = 0; k < n_coefs / 4; k++)
        {
            h_vec = vld1_s16(&h[k*4]);
            x_vec = vld1_s16(&x[n - n_coefs + 1 + k*4]);
            result_vec = vmlal_s16(result_vec, h_vec, x_vec);
        }

        /* Horizontal add of the four partial sums. */
        sum  = vgetq_lane_s32(result_vec, 0);
        sum += vgetq_lane_s32(result_vec, 1);
        sum += vgetq_lane_s32(result_vec, 2);
        sum += vgetq_lane_s32(result_vec, 3);

        /* Scalar tail: the n_coefs % 4 taps the vector loop did not cover. */
        for (k = n_coefs - (n_coefs % 4); k < n_coefs; k++)
        {
            sum += h[k] * x[n - n_coefs + 1 + k];
        }

        /* Drop 16 fractional bits, rounding on the last discarded bit. */
        y[n] = ((sum>>15) + 1) >> 1;
    }
}

相关帖子

沙发
粉色壁纸|  楼主 | 2012-1-28 18:36 | 只看该作者
彩色转灰度:(C)
/**
 * Plain C conversion of packed RGB pixels to 8-bit grey.
 *
 * Each output byte is (77*R + 151*G + 28*B) >> 8. The three weights add up
 * to 256, so the result always fits in 8 bits.
 *
 * @param dest output buffer, n bytes are written
 * @param src  input buffer, 3*n bytes are read in R, G, B order
 * @param n    number of pixels to convert
 */
void reference_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
  for (int px = 0; px < n; ++px)
  {
    const uint8_t *rgb = src + 3 * px;
    int luma = 77 * rgb[0];

    luma += 151 * rgb[1];
    luma += 28 * rgb[2];

    dest[px] = (uint8_t)(luma >> 8);
  }
}





/**
 * NEON conversion of packed RGB pixels to 8-bit grey.
 *
 * Each output byte is (77*R + 151*G + 28*B) >> 8. Pixels are processed eight
 * at a time: vld3_u8 de-interleaves 24 bytes into separate R, G and B vectors,
 * the weighted sum is built in 16-bit lanes, and the high byte of each lane is
 * stored. The n % 8 pixels left over are converted with scalar code using the
 * same formula.
 *
 * @param dest output buffer, n bytes are written
 * @param src  input buffer, 3*n bytes are read in R, G, B order
 * @param n    number of pixels to convert
 */
void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
  int i;
  int blocks = n / 8;
  uint8x8_t rfac = vdup_n_u8 (77);
  uint8x8_t gfac = vdup_n_u8 (151);
  uint8x8_t bfac = vdup_n_u8 (28);

  for (i=0; i<blocks; i++)
  {
    uint16x8_t  temp;
    uint8x8x3_t rgb  = vld3_u8 (src);
    uint8x8_t result;

    /* 255 * (77 + 151 + 28) = 65280, so the sum fits in a 16-bit lane. */
    temp = vmull_u8 (rgb.val[0],      rfac);
    temp = vmlal_u8 (temp,rgb.val[1], gfac);
    temp = vmlal_u8 (temp,rgb.val[2], bfac);

    /* Narrowing shift: keep the high byte of every 16-bit lane. */
    result = vshrn_n_u16 (temp, 8);
    vst1_u8 (dest, result);
    src  += 8*3;
    dest += 8;
  }

  /* Scalar tail for the pixels that do not fill a whole group of eight. */
  for (i = blocks * 8; i < n; i++)
  {
    int r = *src++;
    int g = *src++;
    int b = *src++;

    int y = (r*77)+(g*151)+(b*28);
    *dest++ = (uint8_t)(y>>8);
  }
}

使用特权

评论回复
发新帖 我要提问
您需要登录后才可以回帖 登录 | 注册

本版积分规则

0

主题

121

帖子

1

粉丝