【SSE指令】

本文详细介绍了Intel Intrinsics中用于单精度浮点数的数学运算,如加法、减法、乘法、除法、平方根、倒数及平方根倒数等。此外,还涵盖了逻辑操作,包括位与、位或、位异或和位非,并提供了比较指令和转换指令的说明。这些指令在高性能计算和数值分析中起到关键作用。
摘要由CSDN通过智能技术生成

更多指令请看官网链接:Intel® Intrinsics Guide

数据类型

__m64 任意整型

__m128 4 位 32 bit 浮点型

__m128d 2 位 64 bit 浮点型

__m128i 任意整型

数学运算

__m128 _mm_add_ss(__m128 a, __m128 b);

单精度浮点 低位加法

result = [ a0+b0 , a1 , a2 , a3 ]

__m128 _mm_add_ps(__m128 a, __m128 b);

单精度浮点 加法

result = [ a0+b0 , a1+b1 , a2+b2 , a3+b3 ]

__m128 _mm_sub_ss(__m128 a, __m128 b);

单精度浮点 低位减法

result = [ a0-b0 , a1 , a2 , a3 ]

__m128 _mm_sub_ps(__m128 a, __m128 b);

单精度浮点 减法

result = [ a0-b0 , a1-b1 , a2-b2 , a3-b3 ]

__m128 _mm_mul_ss(__m128 a, __m128 b);

单精度浮点 低位乘法

result = [ a0*b0 , a1 , a2 , a3 ]

__m128 _mm_mul_ps(__m128 a, __m128 b);

单精度浮点乘法

result = [ a0*b0 , a1*b1 , a2*b2 , a3*b3 ]

__m128 _mm_div_ss(__m128 a, __m128 b);

单精度浮点 低位除法

result = [ a0/b0 , a1 , a2 , a3 ]

__m128 _mm_div_ps(__m128 a, __m128 b);

单精度浮点除法

result = [ a0/b0 , a1/b1 , a2/b2 , a3/b3 ]

__m128 _mm_sqrt_ss(__m128 a);

单精度浮点 低位平方根

result = [ sqrt(a0) , a1 , a2 , a3 ]

__m128 _mm_sqrt_ps(__m128 a);

单精度浮点平方根

result = [ sqrt( a0) , sqrt(a1) , sqrt( a2) , sqrt(a3) ]

__m128 _mm_rcp_ss(__m128 a);

单精度浮点 低位倒数近似值

result = [ recip(a0) , a1 , a2 , a3 ]

__m128 _mm_rcp_ps(__m128 a);

单精度浮点 倒数近似值

result = [ recip(a0) , recip(a1) , recip(a2) , recip(a3) ]

__m128 _mm_rsqrt_ss(__m128 a);

单精度浮点 低位平方根倒数近似值

result = [ recip(sqrt(a0)) , a1 , a2 , a3 ]

__m128 _mm_rsqrt_ps(__m128 a);

单精度浮点 平方根倒数近似值

result = [ recip(sqrt(a0)) , recip(sqrt(a1)) , recip(sqrt(a2)) , recip(sqrt(a3)) ]

__m128 _mm_min_ss(__m128 a, __m128 b);

单精度浮点 低位最小值

result = [ min(a0,b0) , a1 , a2 , a3 ]

__m128 _mm_min_ps(__m128 a, __m128 b);

单精度浮点 最小值

result = [ min(a0,b0) , min(a1,b1) , min(a2,b2) , min(a3,b3) ]

__m128 _mm_max_ss(__m128 a, __m128 b);

单精度浮点 低位最大值

result = [ max(a0,b0) , a1 , a2 , a3 ]

__m128 _mm_max_ps(__m128 a, __m128 b);

单精度浮点 最大值

result = [ max(a0,b0) , max(a1,b1) , max(a2,b2) , max(a3,b3) ]

逻辑指令

__m128 _mm_and_ps(__m128 a, __m128 b);

位与

result = [ a&b ]

__m128 _mm_andnot_ps(__m128 a, __m128 b);

a非 与 b

result = [ (~a0)&b0 , (~a1)&b1 , (~a2)&b2 , (~a3)&b3 ]

__m128 _mm_or_ps(__m128 a, __m128 b);

位或

result = [ a|b ]

__m128 _mm_xor_ps(__m128 a, __m128 b);

位异或

result = [ a^b ]

比较指令

指令

作用

类型

TRUE

_mm_cmpeq_ss

Equal

float 低位

0xffffffff

_mm_cmpeq_ps

Equal

float

0xffffffff

_mm_cmplt_ss

Less Than

float 低位

0xffffffff

_mm_cmplt_ps

Less Than

float

0xffffffff

_mm_cmple_ss

Less Than or Equal

float 低位

0xffffffff

_mm_cmple_ps

Less Than or Equal

float

0xffffffff

_mm_cmpgt_ss

Greater Than

float 低位

0xffffffff

_mm_cmpgt_ps

Greater Than

float

0xffffffff

_mm_cmpge_ss

Greater Than or Equal

float 低位

0xffffffff

_mm_cmpge_ps

Greater Than or Equal

float

0xffffffff

_mm_cmpneq_ss

Not Equal

float 低位

0xffffffff

_mm_cmpneq_ps

Not Equal

float

0xffffffff

_mm_cmpnlt_ss

Not Less Than

float 低位

0xffffffff

_mm_cmpnlt_ps

Not Less Than

float

0xffffffff

_mm_cmpnle_ss

Not Less Than or Equal

float 低位

0xffffffff

_mm_cmpnle_ps

Not Less Than or Equal

float

0xffffffff

_mm_cmpngt_ss

Not Greater Than

float 低位

0xffffffff

_mm_cmpngt_ps

Not Greater Than

float

0xffffffff

_mm_cmpnge_ss

Not Greater Than or Equal

float 低位

0xffffffff

_mm_cmpnge_ps

Not Greater Than or Equal

float

0xffffffff

_mm_cmpord_ss

Ordered

float 低位

0xffffffff

_mm_cmpord_ps

Ordered

float

0xffffffff

_mm_cmpunord_ss

Unordered

float 低位

0xffffffff

_mm_cmpunord_ps

Unordered

float

0xffffffff

_mm_comieq_ss

Equal

float 低位

0x1

_mm_comilt_ss

Less Than

float 低位

0x1

_mm_comile_ss

Less Than or Equal

float 低位

0x1

_mm_comigt_ss

Greater Than

float 低位

0x1

_mm_comige_ss

Greater Than or Equal

float 低位

0x1

转换指令

int _mm_cvtss_si32(__m128 a);

返回低位 有符号32bit,近似

result = [ (int)a0 ]

__int64 _mm_cvtss_si64(__m128 a);

返回低位 有符号64bit,近似

result = [ (__int64)a0 ]

__m64 _mm_cvtps_pi32(__m128 a);

返回低位 有符号32bit,近似

result = [ (int)a0 , (int)a1 ]

int _mm_cvttss_si32(__m128 a);

返回低位 有符号32bit,截断

result = [ (int)a0 ]

__int64 _mm_cvttss_si64(__m128 a);

返回低位 有符号64bit,截断

result = [ (__int64)a0 ]

__m64 _mm_cvttps_pi32(__m128 a);

返回低位 有符号32bit,截断

result = [ (int)a0 , (int) a1]

__m128 _mm_cvtsi32_ss(__m128 a, int b);

32bit 整型转化为低位单精度浮点

result = [ (float)b , a1 , a2 , a3 ]

__m128 _mm_cvtsi64_ss(__m128 a, __int64 b);

64bit 整型转化为低位单精度浮点

result = [ (float)b , a1 , a2 , a3 ]

__m128 _mm_cvtpi32_ps(__m128 a, __m64 b);

两个32bit整型转化为单精度浮点

result = [(float)b0 , (float)b1 , a2 , a3 ]

__m128 _mm_cvtpi16_ps(__m64 a);

四个有符号16bit整型值转为单精度浮点

result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpu16_ps(__m64 a);

四个无符号16bit整型值转为单精度浮点

result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpi8_ps(__m64 a);

四个有符号8bit整型值转为单精度浮点

result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpu8_ps(__m64 a);

四个无符号8bit整型值转为单精度浮点

result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b);

四位 32bit整型转化为单精度浮点

result = [ (float)a0 , (float)a1 , (float)b0 , (float)b1 ]

__m64 _mm_cvtps_pi16(__m128 a);

将四个单精度浮点转为四个有符号16bit整型

result = [ (short)a0 , (short)a1 , (short)a2 , (short)a3 ]

__m64 _mm_cvtps_pi8(__m128 a);

四个单精度浮点转为四个有符号8bit整型

result = [ (char)a0 , (char)a1 , (char)a2 , (char)a3 ]

float _mm_cvtss_f32(__m128 a);

返回第一个值

加载指令

__m128 _mm_loadh_pi(__m128 a, __m64 const *p);

从 p 加载2个单精度浮点到高两位,低两位取自 a

result = [ a0 , a1 , p0 , p1 ]

__m128 _mm_loadl_pi(__m128 a, __m64 const *p);

从 p 加载2个单精度浮点到低两位,高两位取自 a

result = [ p0 , p1 , a2 , a3 ]

__m128 _mm_load_ss(float * p);

加载单精度浮点到低位

result = [ p0 , 0.0 , 0.0 , 0.0 ]

__m128 _mm_load1_ps(float * p);

加载并复制

result = [ p0 , p0 , p0 , p0 ]

__m128 _mm_load_ps(float * p);

加载4个单精度浮点,必须16字节对齐

result = [ p0 , p1 , p2 , p3 ]

__m128 _mm_loadu_ps(float * p);

加载4个单精度浮点,不需要16字节对齐

result = [ p0 , p1 , p2 , p3 ]

__m128 _mm_loadr_ps(float * p);

加载4个单精度浮点翻转,必须16字节对齐

result = [ p3 , p2 , p1 , p0 ]

设置指令

__m128 _mm_set_ss(float w);

设置低位单精度浮点

result = [ w , 0.0 , 0.0 , 0.0 ]

__m128 _mm_set1_ps(float w);

设置并复制

result = [ w , w , w , w ]

__m128 _mm_set_ps(float z, float y, float x, float w);

设置4个单精度浮点

result = [ w , x , y , z ]

__m128 _mm_setr_ps(float z, float y, float x, float w);

设置4个单精度浮点翻转

result = [ z , y , x , w ]

__m128 _mm_setzero_ps(void);

将4个单精度浮点全部清零

result = [ 0.0 , 0.0 , 0.0 , 0.0 ]

存储指令

void _mm_storeh_pi(__m64 *p, __m128 a);

存储高位单精度浮点

result = [ a2 , a3 ]

void _mm_storel_pi(__m64 *p, __m128 a);

低两位存入p

result = [ a0 , a1 ]

void _mm_store_ss(float * p, __m128 a);

存低位

result = [ a0 ]

void _mm_store1_ps(float * p, __m128 a);

将低位单精度浮点复制存储到4个连续位置,必须16字节对齐

result = [ a0 , a0 , a0 , a0 ]

void _mm_store_ps(float *p, __m128 a);

存4位单精度浮点,必须16字节对齐

result = [ a0 , a1 , a2 , a3 ]

void _mm_storeu_ps(float *p, __m128 a);

存4位单精度浮点,不需要16字节对齐

result = [ a0 , a1 , a2 , a3 ]

void _mm_storer_ps(float * p, __m128 a);

存4位单精度浮点 翻转,必须16字节对齐

result = [ a3 , a2 , a1 , a0 ]

缓存支持

void _mm_prefetch(char const*a, int sel);

Loads one cache line of data from address a to a location “closer” to the processor. The value sel specifies the type of prefetch operation: the constants _MM_HINT_T0 , _MM_HINT_T1 , _MM_HINT_T2 , _MM_HINT_NTA ,and _MM_HINT_ET0 should be used for systems based on IA-32 architecture, and correspond to the type of prefetch instruction.

void _mm_stream_pi(__m64 *p, __m64 a);

Stores the data in a to the address p without polluting the caches. This intrinsic requires you to empty the multimedia state for the MMX™ register. See the topic The EMMS Instruction: Why You Need It.

void _mm_stream_ps(float *p, __m128 a);

Stores the data in a to the address p without polluting the caches. The address must be 16-byte-aligned.

void _mm256_stream_ps(float *p, __m256 a);

Stores the data in a to the address p without polluting the caches. The address must be 32-byte (VEX.256 encoded version) aligned.

void _mm_sfence(void);

Guarantees that every preceding store is globally visible before any subsequent store.

整型指令

int _mm_extract_pi16(__m64 a, int imm);

返回32bit整型

result = [ (imm==0) ? a0 : ( (imm==1) ? a1 : ( (imm==2) ? a2 : a3 ) ) ]

__m64 _mm_insert_pi16(__m64 a, int d, int n);

将16bit d 插入a

result = [ (n==0) ? d : a0 , (n==1) ? d : a1 , (n==2) ? d : a2 , (n==3) ? d : a3 ]

__m64 _mm_max_pi16(__m64 a, __m64 b);

计算最大值

result = [ max(a0,b0) , max(a1,b1) , max(a2,b2) , max(a3,b3) ]

__m64 _mm_max_pu8(__m64 a, __m64 b);

计算无符号最大值

result = [ max(a0,b0) , max(a1,b1) , … , max(a7,b7) ]

__m64 _mm_min_pi16(__m64 a, __m64 b);

计算最小值

result = [ min(a0,b0) , min(a1,b1) , min(a2,b2) , min(a3,b3) ]

__m64 _mm_min_pu8(__m64 a, __m64 b);

计算无符号最小值

result = [ min(a0,b0) , min(a1,b1) , … , min(a7,b7) ]

int _mm_movemask_pi8(__m64 a);

创建8 bit掩码

result = [ sign(a7)<<7 | sign(a6)<<6 |… | sign(a0) ]

__m64 _mm_mulhi_pu16(__m64 a, __m64 b);

无符号16bit相乘,返回32bit结果的高16bit

result = [ (a0 * b0)[31:16] , (a1 * b1)[31:16] , (a2 * b2)[31:16] , (a3 * b3)[31:16] ]

__m64 _mm_shuffle_pi16(__m64 a, int n);

result = [ word (n&0x3) of a , word ((n>>2)&0x3) of a , word ((n>>4)&0x3) of a , word ((n>>6)&0x3) of a ]

void _mm_maskmove_si64(__m64 d, __m64 n, char *p);

条件存储字节到p

result = [ if(sign(n0)) p[0] = d0 , if(sign(n1)) p[1] = d1 , … , if(sign(n7)) p[7] = d7 ]

__m64 _mm_avg_pu8(__m64 a, __m64 b);

计算近似均值

result = [ (t + 1) >> 1 , where t =(unsigned char)a0 +(unsigned char)b0 , (t + 1) >> 1 , where t =(unsigned char)a1 +(unsigned char)b1 , … , (t + 1) >> 1 , where t =(unsigned char)a7 +(unsigned char)b7 ]

__m64 _mm_avg_pu16(__m64 a, __m64 b);

计算近似均值

result = [ (t + 1) >> 1 , where t =(unsigned short)a0 +(unsigned short)b0 , (t + 1) >> 1 , where t =(unsigned short)a1 +(unsigned short)b1 , … , (t + 1) >> 1 , where t =(unsigned short)a3 +(unsigned short)b3 ]

__m64 _mm_sad_pu8(__m64 a, __m64 b);

绝对差累加

result = [ abs(a0-b0) +… +abs(a7-b7) , 0 , 0 , 0 ]

杂项

__m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);

按照 imm8 指示选择浮点:结果的低两位从 a 中选择,高两位从 b 中选择

__m128 _mm_unpackhi_ps(__m128 a, __m128 b);

交替高位

result = [ a2 , b2 , a3 , b3 ]

__m128 _mm_unpacklo_ps(__m128 a, __m128 b);

交替低位

result = [ a0 , b0 , a1 , b1 ]

__m128 _mm_move_ss( __m128 a, __m128 b);

用 b 的低位浮点替换 a 的低位浮点

result = [ b0 , a1 , a2 , a3 ]

__m128 _mm_movehl_ps(__m128 a, __m128 b);

将 b 的高两位浮点放入结果的低两位,高两位取自 a

result = [ b2 , b3 , a2 , a3 ]

__m128 _mm_movelh_ps(__m128 a, __m128 b);

将 b 的低两位浮点放入结果的高两位,低两位取自 a

result = [ a0 , a1 , b0 , b1 ]

int _mm_movemask_ps(__m128 a);

创建四位掩码

result = [ sign(a3)<<3 | sign(a2)<<2 | sign(a1)<<1 | sign(a0) ]

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

xhgen

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值