elas源码赏析(二)sobel算子3*3行列分解快速卷积

  // Runs the separable 3x3 Sobel filter on an 8-bit image: one column pass
  // into two 16-bit scratch buffers, followed by one row pass per output.
  //
  //   in     input image of w*h bytes, handed unchanged to detail::convolve_cols_3x3
  //   out_v  written by detail::convolve_101_row_3x3_16bit (w*h bytes)
  //   out_h  written by detail::convolve_121_row_3x3_16bit (w*h bytes)
  //   w, h   image dimensions; nothing is done unless both are positive
  //
  // NOTE(review): a failed _mm_malloc is not detected here; a null scratch
  // buffer would be dereferenced below. The function has no way to report
  // the error to its caller, so this is left for a follow-up decision.
  void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
    if( w <= 0 || h <= 0 ) {
      return;
    }

    // Do the size arithmetic in size_t so that w*h cannot overflow int.
    const size_t row_len   = static_cast<size_t>( w );
    const size_t num_elems = row_len * static_cast<size_t>( h );
    const size_t num_bytes = num_elems * sizeof( int16_t );

    // 16-byte alignment: the scratch buffers are accessed as __m128i vectors.
    int16_t* temp_h = static_cast<int16_t*>( _mm_malloc( num_bytes, 16 ) );
    int16_t* temp_v = static_cast<int16_t*>( _mm_malloc( num_bytes, 16 ) );

    // The column pass shown in this file starts writing at row 1 and stops
    // before the last row, while the row passes read the complete buffers.
    // Zero the first and last row so that no uninitialized memory is read
    // and the border output is deterministic.
    int16_t* const last_row_h = temp_h + ( num_elems - row_len );
    int16_t* const last_row_v = temp_v + ( num_elems - row_len );
    for( size_t x = 0; x < row_len; ++x ) {
      temp_h[x]     = 0;
      temp_v[x]     = 0;
      last_row_h[x] = 0;
      last_row_v[x] = 0;
    }

    detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
    detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
    detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );

    _mm_free( temp_h );
    _mm_free( temp_v );
  }

_mm_malloc()

_mm_malloc() 主要用于分配按指定边界对齐的内存;对齐后的数据可以使用 SSE 的对齐加载/存储指令,运行速度更快。这里按 16 字节(即 128 位,一个 __m128i 的大小)对齐,而不是 16 位。
详情可以看《为什么要使用 _mm_malloc?(与 _aligned_malloc、aligned_alloc 或 posix_memalign 相对)》
如何实现最高传输速率

convolve_cols_3x3 列卷积

void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
  using namespace std;
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const int w_chunk  = w/16;
  __m128i* 	i0       = (__m128i*)( in );
  __m128i* 	i1       = (__m128i*)( in ) + w_chunk*1;
  __m128i* 	i2       = (__m128i*)( in ) + w_chunk*2;
  __m128i* result_h  = (__m128i*)( out_h ) + 2*w_chunk;
  __m128i* result_v  = (__m128i*)( out_v ) + 2*w_chunk;
  __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
  for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
    *result_h     = _mm_setzero_si128();
    *(result_h+1) = _mm_setzero_si128();
    *result_v     = _mm_setzero_si128();
    *(result_v+1) = _mm_setzero_si128();
    __m128i ilo, ihi;
    unpack_8bit_to_16bit( *i0, ihi, ilo ); 
    unpack_8bit_to_16bit( *i0, ihi, ilo );
    *result_h     = _mm_add_epi16( ihi, *result_h );
    *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    unpack_8bit_to_16bit( *i1, ihi, ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    unpack_8bit_to_16bit( *i2, ihi, ilo );
    *result_h     = _mm_sub_epi16( *result_h, ihi );
    *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
  }
}

convolve_101_row_3x3_16bit:核为 (1, 0, -1) 的行卷积

 // Convolves the flattened 16-bit image with the row kernel (1,0,-1) and
    // writes the result as 8-bit values (the output is overwritten, not
    // accumulated):
    //   out[k] = saturate_to_0_255( ((in[k-1] - in[k+1]) >> 2) + 128 )
    // for k = 1 .. w*h-2. out[0] and out[w*h-1] are never written.
    //
    // The image is processed as one linear array, so the outputs in the
    // first and last column of a row are computed from elements of the
    // neighbouring rows.
    //
    //   in   w*h int16_t values; read with aligned 128-bit loads starting at
    //        in, so it must be 16-byte aligned
    //   out  w*h bytes; no alignment required (unaligned stores)
    //   w    image width, must be a multiple of 16 (asserted)
    //   h    image height
    //
    // NOTE(review): assumes pack_16bit_to_8bit_saturate saturates signed
    // 16-bit values to the unsigned range [0,255], as its name suggests —
    // confirm against its definition. The scalar tail below clamps to the
    // same range so that all pixels are treated alike.
    void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
      assert( w % 16 == 0 && "width must be multiple of 16!" );
      if( w <= 0 || h <= 0 ) {
        return;
      }

      const __m128i* i0 = (const __m128i*)(in);  // left neighbour, aligned
      const int16_t* i2 = in+2;                  // right neighbour, unaligned
      uint8_t* result   = out + 1;
      const int16_t* const end_input = in + w*h;
      const size_t blocked_loops = (w*h-2)/16;   // full blocks of 16 outputs
      const __m128i offs = _mm_set1_epi16( 128 );

      for( size_t i=0; i != blocked_loops; i++ ) {
        // First eight outputs of this block.
        __m128i result_register_lo =
            _mm_sub_epi16( *i0, _mm_loadu_si128( (const __m128i*)( i2 ) ) );
        result_register_lo =
            _mm_add_epi16( _mm_srai_epi16( result_register_lo, 2 ), offs );
        i0 += 1;
        i2 += 8;

        // Second eight outputs of this block.
        __m128i result_register_hi =
            _mm_sub_epi16( *i0, _mm_loadu_si128( (const __m128i*)( i2 ) ) );
        result_register_hi =
            _mm_add_epi16( _mm_srai_epi16( result_register_hi, 2 ), offs );
        i0 += 1;
        i2 += 8;

        // Narrow both halves to sixteen 8-bit values and store them.
        pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
        _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
        result += 16;
      }

      // Scalar tail for the outputs that do not fill a whole block. Clamp
      // explicitly: a plain conversion to uint8_t would wrap around for
      // values outside [0,255] instead of saturating like the SIMD path.
      for( ; i2 < end_input; i2++, result++ ) {
        int value = ((*(i2-2) - *i2)>>2)+128;
        if( value < 0 ) {
          value = 0;
        } else if( value > 255 ) {
          value = 255;
        }
        *result = static_cast<uint8_t>( value );
      }
    }

convolve_121_row_3x3_16bit:核为 (1, 2, 1) 的行卷积

 // convolve image with a (1,2,1) row vector. Result is accumulated into output.
    // This one works on 16bit input and 8bit output.
    // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255].
    void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
      assert( w % 16 == 0 && "width must be multiple of 16!" );
      const __m128i* i0 = (const __m128i*)(in);
      const int16_t* i1 = in+1;
      const int16_t* i2 = in+2;
      uint8_t* result   = out + 1;
      const int16_t* const end_input = in + w*h;
      const size_t blocked_loops = (w*h-2)/16;
      __m128i offs = _mm_set1_epi16( 128 );
      for( size_t i=0; i != blocked_loops; i++ ) {
        __m128i result_register_lo;
        __m128i result_register_hi;
        __m128i i1_register;
        __m128i i2_register;
        
        i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
        i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
        result_register_lo &
  • 2
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值