- #include <mmintrin.h> //MMX
- #include <xmmintrin.h> //SSE(include mmintrin.h)
- #include <emmintrin.h> //SSE2(include xmmintrin.h)
- #include <pmmintrin.h> //SSE3(include emmintrin.h)
- #include <tmmintrin.h>//SSSE3(include pmmintrin.h)
- #include <smmintrin.h>//SSE4.1(include tmmintrin.h)
- #include <nmmintrin.h>//SSE4.2(include smmintrin.h)
- #include <wmmintrin.h>//AES(include nmmintrin.h)
- #include <immintrin.h>//AVX(include wmmintrin.h)
- #include <intrin.h>//(include immintrin.h)
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
从mem_addr加载64bit到返回值的低64bit,高64bit置0
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b)
取a与b的低64bit,按每8bit交错排布到返回值,低bit对应低bit,先a再b
__m128i _mm_sub_epi16 (__m128i a, __m128i b)
每16bit相减
__m128i _mm_shuffle_epi32 (__m128i a, int imm8)
将__m128i 分为4 * 32 bit,从高位到低位依次为a0 a1 a2 a3
这4个int各有一个2bit的位置代号(二进制):11 10 01 00
imm8每2bit选择一个源元素,如果imm8 = 00 00 00 00(二进制),则得到的__m128i则是(r0 = a3,r1=a3,r2=a3,r3=a3),即4个32bit全为最低位的a3.
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)
每32bit按有符号整数比较,若a>b则相应的32bit置全F,否则置0
__m128i _mm_xor_si128 (__m128i a, __m128i b)
每bit异或
int _mm_cvtsi128_si32 (__m128i a)
取低32位赋值
__m128i _mm_cvtsi32_si128 (int a)
将a赋值给0:31,32-127赋值0
__m128i _mm_srli_epi64 (__m128i a, int imm8)
每64位右移imm8,空出的位置补零
__m128i _mm_srli_epi32 (__m128i a, int imm8)
每32位右移imm8,空出的位置补零
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b)
取a,b的高64位,每32bit交错排列,如:
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
__m128i _mm_srli_si128 (__m128i a, int imm8)
整个128位右移imm8字节(即8*imm8位),空出的位置补零
int _mm_extract_epi16 (__m128i a, int imm8)
a右移imm8*16bit后取低16bit赋值给目标int的低16位,高16位置零
dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0]
dst[31:16] := 0
__m128i _mm_slli_epi32 (__m128i a, int imm8)
每32位左移imm8,空出的位置补零
__m128 _mm_set_ps1 (float a)
将a重复拷贝到4个32bit单精度浮点中
__m128 _mm_set_ps (float e3, float e2, float e1, float e0)
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
__m128 _mm_max_ps (__m128 a, __m128 b)
每32bit取较大值
__m128 _mm_set_ss (float a)
将a赋给目标的0:31 bit,高96bit(32:127)置0