SSE2的简单理解，主要针对opencv 中的优化

最新推荐文章于 2023-11-03 11:51:26 发布

既然如此

最新推荐文章于 2023-11-03 11:51:26 发布

阅读量4.4k

点赞数

分类专栏：学习 文章标签：opencv优化 SSE2加速

本文链接：https://blog.csdn.net/yeyang911/article/details/18318011

版权

学习专栏收录该内容

81 篇文章 3 订阅

订阅专栏

加速主要是一条128位的指令可以一次处理多个运算

比如说 int 是 32 位的，128/32=4，如果是加法，一条指令就可以同时处理 4 个整数的加法

-----------------------------------------------------------------Load--------------------------------------------------------------------------------------------------------------------------

__m128i _mm_load_si128 (__m128i *p);

Loads 128-bit value. Address p must be 16-byte aligned. 必须对齐

r := *p

__m128i _mm_loadu_si128 (__m128i *p);

Loads 128-bit value. Address p does not need to be 16-byte aligned. 可以不对齐

r := *p

__m128i _mm_loadl_epi64(__m128i const*p);

Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result. 只加载低64位，结果的高64位清零

r0:= *p[63:0]
r1:=0x0

-------------------------------------------------------------Set---------------------------------------------------------------------------------------------------

__m128i _mm_set_epi64 (__m64 q1, __m64 q0);

Sets the 2 64-bit integer values.

// Demonstrates _mm_set_epi64: packs two 64-bit values into one __m128i.
// NOTE(review): the .m64_u64 / .m128i_i64 members are MSVC-specific union
// fields of __m64 / __m128i; presumably this builds only with MSVC targeting
// 32-bit x86 (where __m64 is available) -- confirm for your toolchain.
#include <iostream>
#include <emmintrin.h>
#include <smmintrin.h>
using namespace std;

int main()
{
    __m64 a, b;
    a.m64_u64 = 0xA;
    b.m64_u64 = 0xB;

    // Per the prototype _mm_set_epi64(q1, q0): the first argument fills the
    // upper 64 bits and the second argument fills the lower 64 bits.
    __m128i c = _mm_set_epi64(a, b);

    // Print lane 0 (low) then lane 1 (high), separated by a space so the two
    // values can be told apart ("11 10" instead of the ambiguous "1110").
    cout << c.m128i_i64[0] << ' ' << c.m128i_i64[1] << endl;

    return 0;
}

__m128i _mm_set_epi32 (int i3, int i2, int i1, int i0);

Sets the 4 signed 32-bit integer values.

r0 := i0
r1 := i1
r2 := i2
r3 := i3

__m128i _mm_set_epi16 (short w7, short w6,    short w5, short w4,   short w3, short w2,   short w1, short w0);

Sets the 8 signed 16-bit integer values.

r0 := w0
r1 := w1
...
r7 := w7

__m128i _mm_set_epi8 （类似上面的）

Sets the 16 signed 8-bit integer values.

__m128i _mm_set1_epi32 (int i);

Sets the 4 signed 32-bit integer values to i .

r0 := i
r1 := i
r2 := i
r3 := i

__m128i _mm_set1_epi64 (__m64 q);

__m128i _mm_set1_epi16 (short w);

__m128i _mm_set1_epi8 (char b); （类似set1_epi32）

--------------------------------------------------------------------Store----------------------------------------------------------------------------------

void _mm_store_si128 (__m128i *p, __m128i a);

Stores 128-bit value. Address p must be 16-byte aligned.

*p := a

void _mm_storeu_si128 (__m128i *p, __m128i a);（参考Load里面的）

void _mm_storel_epi64(__m128i *p, __m128i a);（参考Load里面的）

void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p);

The high bit of each byte in the selector n determines whether the corresponding byte in d will be stored. Address p does not need to be 16-byte aligned.

if (n0[7]) p[0] := d0
if (n1[7]) p[1] := d1
...
if (n15[7]) p[15] := d15

-----------------------------------------------Logical--------------------------------------------------------------

Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b.

__m128i _mm_and_si128 (__m128i a, __m128i b);

r := a & b

__m128i _mm_andnot_si128 (__m128i a, __m128i b);

r := (~a) & b

__m128i _mm_or_si128 (__m128i a, __m128i b);

r := a | b

__m128i _mm_xor_si128 ( __m128i a, __m128i b);

r := a ^ b

-----------------------------------------------------------------------------------------------------------------------------------------------

__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b);

Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b.

r0 := a0 ; r1 := b0
r2 := a1 ; r3 := b1
r4 := a2 ; r5 := b2
r6 := a3 ; r7 := b3

__m128 _mm_cvtepi32_ps (__m128i a);

Converts the four signed 32-bit integer values of a to single-precision, floating-point values.

r0 := (float) a0
r1 := (float) a1
r2 := (float) a2
r3 := (float) a3

还需要别的可以访问下面的链接：

http://msdn.microsoft.com/en-us/library/hfhxtdwx(v=vs.100).aspx

既然如此

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录