【SSE指令】

xhgen

已于 2023-03-25 09:17:45 修改

阅读量1k

点赞数 1

分类专栏： SSE 文章标签： c++

于 2022-04-11 11:06:06 首次发布

本文链接：https://blog.csdn.net/qq_42222110/article/details/124094240

版权

SSE 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

本文详细介绍了Intel Intrinsics中用于单精度浮点数的数学运算，如加法、减法、乘法、除法、平方根、倒数及平方根倒数等。此外，还涵盖了逻辑操作，包括位与、位或、位异或和位非，并提供了比较指令和转换指令的说明。这些指令在高性能计算和数值分析中起到关键作用。

摘要由CSDN通过智能技术生成

更多指令请看官网链接：Intel® Intrinsics Guide

数据类型

__m64 任意整型

__m128 4 个 32 bit 单精度浮点型

__m128d 2 个 64 bit 双精度浮点型

__m128i 任意整型

数学运算

__m128 _mm_add_ss(__m128 a, __m128 b);

单精度浮点低位加法

result = [ a0+b0 , a1 , a2 , a3 ]

__m128 _mm_add_ps(__m128 a, __m128 b);

单精度浮点加法

result = [ a0+b0 , a1+b1 , a2+b2 , a3+b3 ]

__m128 _mm_sub_ss(__m128 a, __m128 b);

单精度浮点低位减法

result = [ a0-b0 , a1 , a2 , a3 ]

__m128 _mm_sub_ps(__m128 a, __m128 b);

单精度浮点减法

result = [ a0-b0 , a1-b1 , a2-b2 , a3-b3 ]

__m128 _mm_mul_ss(__m128 a, __m128 b);

单精度浮点低位乘法

result = [ a0*b0 , a1 , a2 , a3 ]

__m128 _mm_mul_ps(__m128 a, __m128 b);

单精度浮点乘法

result = [ a0*b0 , a1*b1 , a2*b2 , a3*b3 ]

__m128 _mm_div_ss(__m128 a, __m128 b);

单精度浮点低位除法

result = [ a0/b0 , a1 , a2 , a3 ]

__m128 _mm_div_ps(__m128 a, __m128 b);

单精度浮点除法

result = [ a0/b0 , a1/b1 , a2/b2 , a3/b3 ]

__m128 _mm_sqrt_ss(__m128 a);

单精度浮点低位平方根

result = [ sqrt(a0) , a1 , a2 , a3 ]

__m128 _mm_sqrt_ps(__m128 a);

单精度浮点平方根

result = [ sqrt( a0) , sqrt(a1) , sqrt( a2) , sqrt(a3) ]

__m128 _mm_rcp_ss(__m128 a);

单精度浮点低位倒数近似值

result = [ recip(a0) , a1 , a2 , a3 ]

__m128 _mm_rcp_ps(__m128 a);

单精度浮点倒数近似值

result = [ recip(a0) , recip(a1) , recip(a2) , recip(a3) ]

__m128 _mm_rsqrt_ss(__m128 a);

单精度浮点低位平方根倒数近似值

result = [ recip(sqrt(a0)) , a1 , a2 , a3 ]

__m128 _mm_rsqrt_ps(__m128 a);

单精度浮点平方根倒数近似值

result = [ recip(sqrt(a0)) , recip(sqrt(a1)) , recip(sqrt(a2)) , recip(sqrt(a3)) ]

__m128 _mm_min_ss(__m128 a, __m128 b);

单精度浮点低位最小值

result = [ min(a0,b0) , a1 , a2 , a3 ]

__m128 _mm_min_ps(__m128 a, __m128 b);

单精度浮点最小值

result = [ min(a0,b0) , min(a1,b1) , min(a2,b2) , min(a3,b3) ]

__m128 _mm_max_ss(__m128 a, __m128 b);

单精度浮点低位最大值

result = [ max(a0,b0) , a1 , a2 , a3 ]

__m128 _mm_max_ps(__m128 a, __m128 b);

单精度浮点最大值

result = [ max(a0,b0) , max(a1,b1) , max(a2,b2) , max(a3,b3) ]

逻辑指令

__m128 _mm_and_ps(__m128 a, __m128 b);

位与

result = [ a&b ]

__m128 _mm_andnot_ps(__m128 a, __m128 b);

a非与 b

result = [ (~a0)&b0 , (~a1)&b1 , (~a2)&b2 , (~a3)&b3 ]

__m128 _mm_or_ps(__m128 a, __m128 b);

位或

result = [ a|b ]

__m128 _mm_xor_ps(__m128 a, __m128 b);

位异或

result = [ a^b ]

比较指令

指令	作用	类型	TRUE
_mm_cmpeq_ss	Equal	float 低位	0xffffffff
_mm_cmpeq_ps	Equal	float	0xffffffff
_mm_cmplt_ss	Less Than	float 低位	0xffffffff
_mm_cmplt_ps	Less Than	float	0xffffffff
_mm_cmple_ss	Less Than or Equal	float 低位	0xffffffff
_mm_cmple_ps	Less Than or Equal	float	0xffffffff
_mm_cmpgt_ss	Greater Than	float 低位	0xffffffff
_mm_cmpgt_ps	Greater Than	float	0xffffffff
_mm_cmpge_ss	Greater Than or Equal	float 低位	0xffffffff
_mm_cmpge_ps	Greater Than or Equal	float	0xffffffff
_mm_cmpneq_ss	Not Equal	float 低位	0xffffffff
_mm_cmpneq_ps	Not Equal	float	0xffffffff
_mm_cmpnlt_ss	Not Less Than	float 低位	0xffffffff
_mm_cmpnlt_ps	Not Less Than	float	0xffffffff
_mm_cmpnle_ss	Not Less Than or Equal	float 低位	0xffffffff
_mm_cmpnle_ps	Not Less Than or Equal	float	0xffffffff
_mm_cmpngt_ss	Not Greater Than	float 低位	0xffffffff
_mm_cmpngt_ps	Not Greater Than	float	0xffffffff
_mm_cmpnge_ss	Not Greater Than or Equal	float 低位	0xffffffff
_mm_cmpnge_ps	Not Greater Than or Equal	float	0xffffffff
_mm_cmpord_ss	Ordered	float 低位	0xffffffff
_mm_cmpord_ps	Ordered	float	0xffffffff
_mm_cmpunord_ss	Unordered	float 低位	0xffffffff
_mm_cmpunord_ps	Unordered	float	0xffffffff
_mm_comieq_ss	Equal	float 低位	0x1
_mm_comilt_ss	Less Than	float 低位	0x1
_mm_comile_ss	Less Than or Equal	float 低位	0x1
_mm_comigt_ss	Greater Than	float 低位	0x1
_mm_comige_ss	Greater Than or Equal	float 低位	0x1

转换指令

int _mm_cvtss_si32(__m128 a);

返回低位有符号32bit,近似

result = [ (int)a0 ]

__int64 _mm_cvtss_si64(__m128 a);

返回低位有符号64bit,近似

result = [ (__int64)a0 ]

__m64 _mm_cvtps_pi32(__m128 a);

将低两位单精度浮点转为两个有符号32bit整型,按当前舍入模式取整

result = [ (int)a0 , (int)a1 ]

int _mm_cvttss_si32(__m128 a);

返回低位有符号32bit,截断

result = [ (int)a0 ]

__int64 _mm_cvttss_si64(__m128 a);

返回低位有符号64bit,截断

result = [ (__int64)a0 ]

__m64 _mm_cvttps_pi32(__m128 a);

将低两位单精度浮点转为两个有符号32bit整型,截断

result = [ (int)a0 , (int) a1]

__m128 _mm_cvtsi32_ss(__m128 a, int b);

32bit 整型转化为低位单精度浮点

result = [ (float)b , a1 , a2 , a3 ]

__m128 _mm_cvtsi64_ss(__m128 a, __int64 b);

64bit 整型转化为低位单精度浮点

result = [ (float)b , a1 , a2 , a3 ]

__m128 _mm_cvtpi32_ps(__m128 a, __m64 b);

两个32bit整型转化为单精度浮点

result = [(float)b0 , (float)b1 , a2 , a3 ]

__m128 _mm_cvtpi16_ps(__m64 a);

四个有符号16bit整型值转为单精度浮点

result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpu16_ps(__m64 a);

四个无符号16bit整型值转为单精度浮点

result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpi8_ps(__m64 a);

四个有符号8bit整型值转为单精度浮点

result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpu8_ps(__m64 a);

四个无符号8bit整型值转为单精度浮点

result = [(float)a0 , (float)a1 , (float)a2 , (float)a3 ]

__m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b);

四位 32bit整型转化为单精度浮点

result = [ (float)a0 , (float)a1 , (float)b0 , (float)b1 ]

__m64 _mm_cvtps_pi16(__m128 a);

将四个单精度浮点转为四个有符号16bit整型

result = [ (short)a0 , (short)a1 , (short)a2 , (short)a3 ]

__m64 _mm_cvtps_pi8(__m128 a);

四个单精度浮点转为四个有符号8bit整型

result = [ (char)a0 , (char)a1 , (char)a2 , (char)a3 ]

float _mm_cvtss_f32(__m128 a);

返回第一个值

加载指令

__m128 _mm_loadh_pi(__m128 a, __m64 const *p);

从 p 加载2个单精度浮点到高两位,低两位取自 a

result = [ a0 , a1 , p0 , p1 ]

__m128 _mm_loadl_pi(__m128 a, __m64 const *p);

从 p 加载2个单精度浮点到低两位,高两位取自 a

result = [ p0 , p1 , a2 , a3 ]

__m128 _mm_load_ss(float * p);

加载单精度浮点到低位

result = [ p0 , 0.0 , 0.0 , 0.0 ]

__m128 _mm_load1_ps(float * p);

加载并复制

result = [ p0 , p0 , p0 , p0 ]

__m128 _mm_load_ps(float * p);

加载4个单精度浮点,必须16字节对齐

result = [ p0 , p1 , p2 , p3 ]

__m128 _mm_loadu_ps(float * p);

加载4个单精度浮点,不需要16字节对齐

result = [ p0 , p1 , p2 , p3 ]

__m128 _mm_loadr_ps(float * p);

加载4个单精度浮点翻转,必须16字节对齐

result = [ p3 , p2 , p1 , p0 ]

设置指令

__m128 _mm_set_ss(float w);

设置低位单精度浮点

result = [ w , 0.0 , 0.0 , 0.0 ]

__m128 _mm_set1_ps(float w);

设置并复制

result = [ w , w , w , w ]

__m128 _mm_set_ps(float z, float y, float x, float w);

设置4个单精度浮点

result = [ w , x , y , z ]

__m128 _mm_setr_ps(float z, float y, float x, float w);

设置4个单精度浮点翻转

result = [ z , y , x , w ]

__m128 _mm_setzero_ps(void);

将4个单精度浮点清零

result = [ 0.0 , 0.0 , 0.0 , 0.0 ]

存储指令

void _mm_storeh_pi(__m64 *p, __m128 a);

存储高位单精度浮点

result = [ a2 , a3 ]

void _mm_storel_pi(__m64 *p, __m128 a);

低两位存入p

result = [ a0 , a1 ]

void _mm_store_ss(float * p, __m128 a);

存低位

result = [ a0 ]

void _mm_store1_ps(float * p, __m128 a);

存单精度浮点

result = [ a0 , a0 , a0 , a0 ]

void _mm_store_ps(float *p, __m128 a);

存4位单精度浮点,必须16字节对齐

result = [ a0 , a1 , a2 , a3 ]

void _mm_storeu_ps(float *p, __m128 a);

存4位单精度浮点,不需要16字节对齐

result = [ a0 , a1 , a2 , a3 ]

void _mm_storer_ps(float * p, __m128 a);

存4位单精度浮点翻转,必须16字节对齐

result = [ a3 , a2 , a1 , a0 ]

缓存支持

void _mm_prefetch(char const*a, int sel);

Loads one cache line of data from address a to a location “closer” to the processor. The value sel specifies the type of prefetch operation: the constants _MM_HINT_T0 , _MM_HINT_T1 , _MM_HINT_T2 , _MM_HINT_NTA ,and _MM_HINT_ET0 should be used for systems based on IA-32 architecture, and correspond to the type of prefetch instruction.

void _mm_stream_pi(__m64 *p, __m64 a);

Stores the data in a to the address p without polluting the caches. This intrinsic requires you to empty the multimedia state for the MMX™ register. See the topic The EMMS Instruction: Why You Need It.

void _mm_stream_ps(float *p, __m128 a);

Stores the data in a to the address p without polluting the caches. The address must be 16-byte-aligned.

void _mm256_stream_ps(float *p, __m256 a);

Stores the data in a to the address p without polluting the caches. The address must be 32-byte (VEX.256 encoded version) aligned.

void _mm_sfence(void);

Guarantees that every preceding store is globally visible before any subsequent store.

整型指令

int _mm_extract_pi16(__m64 a, int imm);

提取 a 中由 imm 选中的16bit字,零扩展为32bit整型返回

result = [ (imm==0) ? a0 : ( (imm==1) ? a1 : ( (imm==2) ? a2 : a3 ) ) ]

__m64 _mm_insert_pi16(__m64 a, int d, int n);

将16bit d 插入a

result = [ (n==0) ? d : a0 , (n==1) ? d : a1 , (n==2) ? d : a2 , (n==3) ? d : a3 ]

__m64 _mm_max_pi16(__m64 a, __m64 b);

计算最大值

result = [ max(a0,b0) , max(a1,b1) , max(a2,b2) , max(a3,b3) ]

__m64 _mm_max_pu8(__m64 a, __m64 b);

计算无符号最大值

result = [ max(a0,b0) , max(a1,b1) , … , max(a7,b7) ]

__m64 _mm_min_pi16(__m64 a, __m64 b);

计算最小值

result = [ min(a0,b0) , min(a1,b1) , min(a2,b2) , min(a3,b3) ]

__m64 _mm_min_pu8(__m64 a, __m64 b);

计算无符号最小值

result = [ min(a0,b0) , min(a1,b1) , … , min(a7,b7) ]

int _mm_movemask_pi8(__m64 a);

创建8 bit掩码

result = [ sign(a7)<<7 | sign(a6)<<6 |… | sign(a0) ]

__m64 _mm_mulhi_pu16(__m64 a, __m64 b);

无符号16bit相乘,返回32bit结果的高16bit

result = [ (a0 * b0)[31:16] , (a1 * b1)[31:16] , (a2 * b2)[31:16] , (a3 * b3)[31:16] ]

__m64 _mm_shuffle_pi16(__m64 a, int n);

result = [ word (n&0x3) of a , word ((n>>2)&0x3) of a , word ((n>>4)&0x3) of a , word ((n>>6)&0x3) of a ]

void _mm_maskmove_si64(__m64 d, __m64 n, char *p);

条件存储字节到p

result = [ if(sign(n0)) p[0] = d0 , if(sign(n1)) p[1] = d1 , … , if(sign(n7)) p[7] = d7 ]

__m64 _mm_avg_pu8(__m64 a, __m64 b);

计算近似均值

result = [ (a0 + b0 + 1) >> 1 , (a1 + b1 + 1) >> 1 , … , (a7 + b7 + 1) >> 1 ]（a、b 按无符号8bit解释,中间和不溢出,即向上取整的均值）

__m64 _mm_avg_pu16(__m64 a, __m64 b);

计算近似均值

__m64 _mm_sad_pu8(__m64 a, __m64 b);

绝对差累加

result = [ abs(a0-b0) +… +abs(a7-b7) , 0 , 0 , 0 ]

杂项

__m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);

按照 imm8 指示,结果的低两位从 a 中选择,高两位从 b 中选择

__m128 _mm_unpackhi_ps(__m128 a, __m128 b);

交替高位

result = [ a2 , b2 , a3 , b3 ]

__m128 _mm_unpacklo_ps(__m128 a, __m128 b);