/*
 * 128-bit integer SIMD value as declared in the MSVC intrinsic headers.
 *
 * All members of the union overlay the same 16 bytes, so the value can be
 * read as signed or unsigned integers of 8, 16, 32 or 64 bits.
 * _CRT_ALIGN(16) is an MSVC CRT macro; presumably it expands to
 * __declspec(align(16)), giving 16-byte alignment -- confirm against the
 * CRT headers in use.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
    __int8           m128i_i8[16];   /* 16 x signed 8-bit    (char)               */
    __int16          m128i_i16[8];   /*  8 x signed 16-bit   (short)              */
    __int32          m128i_i32[4];   /*  4 x signed 32-bit   (int)                */
    __int64          m128i_i64[2];   /*  2 x signed 64-bit   (long long)          */
    unsigned __int8  m128i_u8[16];   /* 16 x unsigned 8-bit  (unsigned char)      */
    unsigned __int16 m128i_u16[8];   /*  8 x unsigned 16-bit (unsigned short)     */
    unsigned __int32 m128i_u32[4];   /*  4 x unsigned 32-bit (unsigned int)       */
    unsigned __int64 m128i_u64[2];   /*  2 x unsigned 64-bit (unsigned long long) */
} __m128i;  /* the original was missing this terminating semicolon */
// 128-bit SIMD value holding two doubles, as declared for MSVC.
// __declspec(align(16)) requests 16-byte alignment for objects of this type.
typedef struct __declspec(intrin_type) __declspec(align(16)) __m128d {
double m128d_f64[2]; // the two double-precision elements
} __m128d;
(1)__m128i _mm_add_epi16(__m128i a, __m128i b):将a中8个16位有符号或无符号整数与对应的b中的8个16位有符号或无符号整数相加
(2)_mm_sub_epi16(__m128i S0,__m128i S1):将S0和S1中对应位置的16bit整数分别相减
(3)_mm_unpacklo_epi8(__m128i S0,__m128i S1):将S0和S1的低64位数以8位为单位进行交错
(4)__m128i _mm_set_epi16(short w7,short w6,short w5,short w4,short w3, short w2,short w1, short w0);
使用8个short(16bits)变量来设置__m128i变量;
(5)_mm_mullo_epi16(__m128i S0,__m128i S1):
返回一个__m128i的寄存器,它含有8个16位的整数,分别为S0和S1对应位置的16位的整数相乘结果的低16位数据;
(6)_mm_srli_epi16(__m128i a, int count);
返回一个__m128i的寄存器,将寄存器a中的8个16bit整数按照count进行相同的逻辑右移,移位填充值为0。
(7)_mm_shuffle_epi8(sclaL, _mm_setr_epi8(0, 6, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
控制掩码的每个字节决定结果中对应字节的来源:0-取源寄存器的第0个字节,6-取第6个字节,-1(最高位为1)表示该结果字节置0
(8)__m128i _mm_or_si128 ( __m128i a, __m128i b); 将a中128 bits数值与b中的128bits数值对应位按位做"或(OR)"运算;
(9)加载:
__m128 _mm_load_ss(float *p) //将一个单精度浮点数加载到寄存器的最低32位(第一个元素),其它三个元素清零(r0 := *p, r1 := r2 := r3 := 0.0)
__m128 _mm_load_ps(float *p) //将四个单精度浮点数加载到寄存器(r0 := p[0], r1 := p[1], r2 := p[2], r3 := p[3])
__m128 _mm_load1_ps(float *p)//将p地址的值加载到寄存器的四个元素,需要多条指令完成。从性能考虑,在内层循环不要使用这类指令(r0 := r1 := r2 := r3 := *p)
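__m128 _mm_loadh_pi(__m128 a, __m64 *p)//从p加载64位(两个单精度浮点数)到结果的高64位,低64位取自a(r0 := a0, r1 := a1, r2 := *p的低32位, r3 := *p的高32位)
__m128 _mm_loadl_pi(__m128 a, __m64 *p)//从p加载64位(两个单精度浮点数)到结果的低64位,高64位取自a(r0 := *p的低32位, r1 := *p的高32位, r2 := a2, r3 := a3)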
__m128 _mm_loadr_ps(float *p)//以_mm_load_ps反向的顺序加载,需要多条指令完成。(r0 := p[3], r1 := p[2], r2 := p[1], r3 := p[0])
__m128 _mm_loadu_ps(float *p)//_mm_load_ps一样的加载,但是不要求地址是16字节对齐
__m128i _mm_load_si128 (__m128i *p); //加载128bits值,p必须是一个16字节(16-byte)对齐的变量的地址
__m128i _mm_loadu_si128 (__m128i *p);//加载128bits值;p不要求是16字节(16-byte)对齐的变量的地址
__m128i _mm_loadl_epi64(__m128i const*p);//加载p所指向的变量的低64位数据到返回值变量的低64位中,高64位赋值为0;
(10)设置:
__m128 _mm_set_ss(float w)//对应于_mm_load_ss的功能,不需要字节对齐,需要多条指令(r0 = w, r1 = r2 = r3 = 0.0)
__m128 _mm_set_ps(float z, float y, float x, float w)//对应于_mm_load_ps的功能,参数是四个单独的单精度浮点数,所以也不需要字节对齐,需要多条指令。(r0=w, r1 = x, r2 = y, r3 = z,注意顺序)
__m128 _mm_set1_ps(float w)//对应于_mm_load1_ps的功能,不需要字节对齐,需要多条指令。(r0 = r1 = r2 = r3 = w)
__m128 _mm_setr_ps(float z, float y, float x, float w)//对应于_mm_loadr_ps功能,不需要字节对齐,需要多条指令。(r0=z, r1 = y, r2 = x, r3 = w,注意顺序)
__m128 _mm_setzero_ps()//清0操作,只需要一条指令。(r0 = r1 = r2 = r3 = 0.0)
_mm_setzero_si128()://将128位值都赋值为0;
(11)储存:
void _mm_store_ss(float *p, __m128 a) //一条指令,*p = a0
void _mm_store_ps(float *p, __m128 a) //一条指令,p[i] = a[i]
void _mm_store1_ps(float *p, __m128 a) //多条指令,p[i] = a0
void _mm_storeh_pi(__m64 *p, __m128 a) //将a的高64位(a2、a3两个单精度浮点数)存储到p
void _mm_storel_pi(__m64 *p, __m128 a) //将a的低64位(a0、a1两个单精度浮点数)存储到p
void _mm_storer_ps(float *p, __m128 a) //反向,多条指令
void _mm_storeu_ps(float *p, __m128 a) //一条指令,p[i] = a[i],不要求16字节对齐
void _mm_stream_ps(float *p, __m128 a) //直接写入内存,不改变cache的数据
(12)数据转换:
_mm_cvtss_si32 //单精度浮点数转换为有符号32位整数(按当前舍入模式舍入)
_mm_cvttss_si32 //单精度浮点数转换为有符号32位整数(带截断操作)
_mm_cvtpi16_ps //16位有符号整数转换为单精度浮点数
__m128i _mm_cvtepu8_epi16(__m128i a);//将8位转为16位
// Packed integer sign-extension
// Each intrinsic widens the lowest source elements of the argument, preserving sign.
extern __m128i _mm_cvtepi8_epi32 (__m128i); // low 4 x int8  -> 4 x int32
extern __m128i _mm_cvtepi16_epi32(__m128i); // low 4 x int16 -> 4 x int32
extern __m128i _mm_cvtepi8_epi64 (__m128i); // low 2 x int8  -> 2 x int64
extern __m128i _mm_cvtepi32_epi64(__m128i); // low 2 x int32 -> 2 x int64
extern __m128i _mm_cvtepi16_epi64(__m128i); // low 2 x int16 -> 2 x int64
extern __m128i _mm_cvtepi8_epi16 (__m128i); // low 8 x int8  -> 8 x int16
// Packed integer zero-extension
// Same widenings as above, but the new upper bits are filled with zeros.
extern __m128i _mm_cvtepu8_epi32 (__m128i); // low 4 x uint8  -> 4 x 32-bit
extern __m128i _mm_cvtepu16_epi32(__m128i); // low 4 x uint16 -> 4 x 32-bit
extern __m128i _mm_cvtepu8_epi64 (__m128i); // low 2 x uint8  -> 2 x 64-bit
extern __m128i _mm_cvtepu32_epi64(__m128i); // low 2 x uint32 -> 2 x 64-bit
extern __m128i _mm_cvtepu16_epi64(__m128i); // low 2 x uint16 -> 2 x 64-bit
extern __m128i _mm_cvtepu8_epi16 (__m128i); // low 8 x uint8  -> 8 x 16-bit
(13)打包
__m128i _mm_packus_epi16 (__m128i a, __m128i b);
r0 := UnsignedSaturate(a0)
r1 := UnsignedSaturate(a1)
...
r7 := UnsignedSaturate(a7)
r8 := UnsignedSaturate(b0)
r9 := UnsignedSaturate(b1)
...
r15 := UnsignedSaturate(b7)
(14)将16位变为8位,所得8位范围为(0-255)或者((-128)-(127))
__m256i res = _mm256_packs_epi16(e1, e21);(char型)
__m256i res = _mm256_packus_epi16(e1, e21);(uchar型)
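__m256i res1 = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0)); //256位的pack指令在两个128位通道内分别打包,结果的64位块顺序为(e1低, e21低, e1高, e21高),因此需要用permute4x64交换中间两个64位块,恢复为(e1全部, e21全部)的顺序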
(15)位移
a.乘法
__m256i temp = _mm256_srli_epi16(blue_number, 1);(注意:srli是逻辑右移,高位补0,相当于无符号数除以2,并不是乘法)
__m256i temp1 = _mm256_slli_epi16(blue_number, 1);(左移1位,相当于乘以2,正数、负数均适用,溢出的高位被丢弃)
b.除法
__m256i temp2 = _mm256_srai_epi16(blue_number, 1);(算术右移1位,高位补符号位,相当于有符号数除以2,结果向负无穷方向取整)
(16)
__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)
将a,b的8个float值,分别相乘(如果imm8第4-7位为1),然后前4个求和,后4个求和,如果imm8的低0-3位为1就分配这个和值到目标区域。
(17)求倒数(注意:rcp/rsqrt指令给出的是近似值,最大相对误差小于1.5*2^-12,并非精确的1.0/x)
a.extern __m256 __cdecl _mm256_rcp_ps(__m256);
FOR j := 0 to 7
i := j*32
dst[i+31:i] := 1.0 / a[i+31:i]
ENDFOR
dst[MAX:256] := 0
b._mm256_rsqrt_ps
FOR j := 0 to 7
i := j*32
dst[i+31:i] := (1.0 / SQRT(a[i+31:i]))
ENDFOR
dst[MAX:256] := 0
(18)相等判断
__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b)
FOR j := 0 to 15
i := j*16
dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
dst[MAX:256] := 0
(19)if else
//大于2 等于10(cmpgt(blue_number, 2)在blue_number > 2的位置生成全1掩码)
__m256i dau = _mm256_cmpgt_epi16(blue_number,_mm256_set1_epi16(2));
__m256i t = _mm256_and_si256(dau, _mm256_set1_epi16(10));
//小于2 等于30(cmpgt(2, blue_number)在blue_number < 2的位置生成全1掩码;等于2时两个掩码都为0,结果为0)
__m256i dau1 = _mm256_cmpgt_epi16( _mm256_set1_epi16(2), blue_number);
__m256i t1 = _mm256_and_si256(dau1, _mm256_set1_epi16(30));
__m256i t3 = _mm256_or_si256(t, t1);
实例:
1)__m128i S0 = _mm_loadl_epi64((__m128i *)(Src + 0 * StrideS)); // 0 0 0 0 0 0 0 0 A7 A6 A5 A4 A3 A2 A1 A0
2)__m128i S1 = _mm_loadl_epi64((__m128i *)(Src + 1 * StrideS)); // 0 0 0 0 0 0 0 0 B7 B6 B5 B4 B3 B2 B1 B0
3)__m128i S2 = _mm_loadl_epi64((__m128i *)(Src + 2 * StrideS)); // 0 0 0 0 0 0 0 0 C7 C6 C5 C4 C3 C2 C1 C0
4)__m128i S3 = _mm_loadl_epi64((__m128i *)(Src + 3 * StrideS)); // 0 0 0 0 0 0 0 0 D7 D6 D5 D4 D3 D2 D1 D0
5)__m128i S01 = _mm_unpacklo_epi8(S0, S1); // B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
6)__m128i S23 = _mm_unpacklo_epi8(S2, S3); // D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0
7)__m128i S0123L = _mm_unpacklo_epi16(S01, S23); // D3 C3 B3 A3 D2 C2 B2 A2 D1 C1 B1 A1 D0 C0 B0 A0
8)__m128i S0123H = _mm_unpackhi_epi16(S01, S23); // D7 C7 B7 A7 D6 C6 B6 A6 D5 C5 B5 A5 D4 C4 B4 A4
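9)__m128i sumaL = _mm_setr_epi8(13, 0,2 ,1,0, 11, 13, 0, 2, 1, 0, 11, 12,13, 0, 2 ); //8位的值(从低字节到高字节)为13, 0, 2, 1, 0, 11, 13, 0, 2, 1, 0, 11, 12, 13, 0, 2;按16位解读时(小端,低字节在前)各值依次为 13+0*256=13, 2+1*256=258, 0+11*256=2816, 13+0*256=13, 2+1*256=258, 0+11*256=2816, 12+13*256=3340, 0+2*256=512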
10)__m128i v = _mm_blendv_epi8(b, a, bmask1);
const __m128i bmask1 = _mm_setr_epi8(255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
const __m128i b = _mm_setr_epi8( 49 60 47 49 57 37 42 59 35 45 52 38 42 50 43 41);
const __m128i a = _mm_setr_epi8(42 46 52 38 40 47 34 42 50 38 44 54 44 45 54 44);
__m128i v = (42 46 52 38 40 47 42 59 35 45 52 38 42 50 43 41);
11)__m256i p0 = _mm256_permute4x64_epi64(BGR_16, 0x1B); (0x1B = 00 01 10 11 = 0 1 2 3)
__m256i BGR_16 = (42 46 52 38 40 47 34 42 50 38 44 54 44 45 54 44 49 60 47 49 57 37 42 59 35 45 52 38 42 50 43 41);
__m256i p0 = (35 45 52 38 42 50 43 41 49 60 47 49 57 37 42 59 50 38 44 54 44 45 54 44 42 46 52 38 40 47 34 42);
12)__m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8)
dst[255:0] := a[255:0]
CASE (imm8[0]) OF
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
13)__m256i a0 = _mm256_setr_epi8(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32);
__m256i a1 = _mm256_setr_epi8(51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82);
__m256i b0 = _mm256_unpacklo_epi8(a0, a1);
__m256i b1 = _mm256_unpackhi_epi8(a0, a1);
b0: 1 51 2 52 3 53 4 54 5 55 6 56 7 57 8 58 17 67 18 68 19 69 20 70 21 71 22 72 23 73 24 74
b1: 9 59 10 60 11 61 12 62 13 63 14 64 15 65 16 66 25 75 26 76 27 77 28 78 29 79 30 80 31 81 32 82
指令集笔记
最新推荐文章于 2023-05-05 08:38:14 发布