MMX Intrinsics各函数介绍

最新推荐文章于 2022-04-06 15:41:26 发布

谢灵枢

最新推荐文章于 2022-04-06 15:41:26 发布

阅读量751

点赞数

分类专栏： CPU Instrunction Set

CPU Instrunction Set 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

原文地址：http://blog.csdn.net/fengbingchun/article/details/20228667

SIMD相关头文件包括：

//#include <ivec.h>//MMX
//#include <fvec.h>//SSE(also include ivec.h)
//#include <dvec.h>//SSE2(also include fvec.h)

#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h>//SSSE3(include pmmintrin.h)
#include <smmintrin.h>//SSE4.1(include tmmintrin.h)
#include <nmmintrin.h>//SSE4.2(include smmintrin.h)
#include <wmmintrin.h>//AES(include nmmintrin.h)
#include <immintrin.h>//AVX(include wmmintrin.h)
#include <intrin.h>//(include immintrin.h)

mmintrin.h为MMX头文件，其中__m64的定义为：

typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;
    float               m64_f32[2];
    __int8              m64_i8[8];
    __int16             m64_i16[4];
    __int32             m64_i32[2];    
    __int64             m64_i64;
    unsigned __int8     m64_u8[8];
    unsigned __int16    m64_u16[4];
    unsigned __int32    m64_u32[2];
} __m64;

mmintrin.h文件中各函数的介绍

	/* General support intrinsics */
	//Empties the multimedia state,清除MMX寄存器中的内容，即初始化(以避免和浮点数
	//操作发生冲突)，详细说明见参考文献1
	void  _m_empty(void);//_mm_empty
	//Converts the integer object _I to a 64-bit __m64 object, r0=_I, r1=0
	__m64 _m_from_int(int _I);//_mm_cvtsi32_si64
	//Converts the lower 32 bits of the __m64 object _M to an integer, r=_M0
	int   _m_to_int(__m64 _M);//_mm_cvtsi64_si32
	//Packs the four 16-bit values from _MM1 into the lower four 8-bit values of
	//the result with signed saturation, and packs the four 16-bit values from _MM2
	//into the upper four 8-bit values of the result with signed saturation
	__m64 _m_packsswb(__m64 _MM1, __m64 _MM2);//_mm_packs_pi16
	//Packs the two 32-bit values from _MM1 into the lower two 16-bit values of the
	// result with signed saturation, and packs the two 32-bit values from _MM2 into
	// the upper two 16-bit values of the result with signed saturation
	__m64 _m_packssdw(__m64 _MM1, __m64 _MM2);//_mm_packs_pi32
	//Packs the four 16-bit values from _MM1 into the lower four 8-bit values of the
	//result with unsigned saturation, and packs the four 16-bit values from _MM2 into
	//the upper four 8-bit values of the result with unsigned saturation
	__m64 _m_packuswb(__m64 _MM1, __m64 _MM2);//_mm_packs_pu16
	//_MM1=(_MM10, _MM11, _MM12, _MM13, _MM14, _MM15, _MM16, _MM17),
	//_MM2=(_MM20, _MM21, _MM22, _MM23, _MM24, _MM25, _MM26, _MM27),
	//则r=(_MM14, _MM24, _MM15, _MM25, _MM16, _MM26, _MM17, _MM27)
	__m64 _m_punpckhbw(__m64 _MM1, __m64 _MM2);//_mm_unpackhi_pi8 
	//_MM1=(_MM10, _MM11, _MM12, _MM13),_MM10为低位，_MM2=(_MM20, _MM21, _MM22, _MM23),
	//则r=(_MM12, _MM22, _MM13, _MM23)
	__m64 _m_punpckhwd(__m64 _MM1, __m64 _MM2);//_mm_unpackhi_pi16
	//MM1=(_MM10, _MM11),_MM10为低位，_MM2=(_MM20, _MM21),则r=(_MM11, _MM21)
	__m64 _m_punpckhdq(__m64 _MM1, __m64 _MM2);//_mm_unpackhi_pi32
	//_MM1=(_MM10, _MM11, _MM12, _MM13, _MM14, _MM15, _MM16, _MM17),
	//_MM2=(_MM20, _MM21, _MM22, _MM23, _MM24, _MM25, _MM26, _MM27),
	//则r=(_MM10, _MM20, _MM11, _MM21, _MM12, _MM22, _MM13, _MM23)
	__m64 _m_punpcklbw(__m64 _MM1, __m64 _MM2);//_mm_unpacklo_pi8
	//_MM1=(_MM10, _MM11, _MM12, _MM13),_MM10为低位，_MM2=(_MM20, _MM21, _MM22, _MM23),
	//则r=(_MM10, _MM20, _MM11, _MM21)
	__m64 _m_punpcklwd(__m64 _MM1, __m64 _MM2);//_mm_unpacklo_pi16
	//MM1=(_MM10, _MM11),_MM10为低位，_MM2=(_MM20, _MM21),则r=(_MM10, _MM20)
	__m64 _m_punpckldq(__m64 _MM1, __m64 _MM2);//mm_unpacklo_pi32

	/* Packed arithmetic intrinsics */
	//Adds the eight 8-bit values in _MM1 to the eight 8-bit values in _MM2
	__m64 _m_paddb(__m64 _MM1, __m64 _MM2);//_mm_add_pi8
	//Adds the four 16-bit values in _MM1 to the four 16-bit values in _MM2
	__m64 _m_paddw(__m64 _MM1, __m64 _MM2);//_mm_add_pi16
	//Adds the two 32-bit values in _MM1 to the two 32-bit values in _MM2
	__m64 _m_paddd(__m64 _MM1, __m64 _MM2);//_mm_add_pi32
	//Adds the eight signed 8-bit values in _MM1 to the eight signed 8-bit values in _MM2
	//and saturates
	__m64 _m_paddsb(__m64 _MM1, __m64 _MM2);//_mm_adds_pi8
	//Adds the four signed 16-bit values in _MM1 to the four signed 16-bit values in _MM2
	//and saturates
	__m64 _m_paddsw(__m64 _MM1, __m64 _MM2);//_mm_adds_pi16
	//Adds the eight unsigned 8-bit values in _MM1 to the eight unsigned 8-bit values 
	//in _MM2 and saturates
	__m64 _m_paddusb(__m64 _MM1, __m64 _MM2);//_mm_adds_pu8
	//Add the four unsigned 16-bit values in _MM1 to the four unsigned 16-bit values 
	//in _MM2 and saturates
	__m64 _m_paddusw(__m64 _MM1, __m64 _MM2);//_mm_adds_pu16
	//Subtracts the eight 8-bit values in _MM2 from the eight 8-bit values in _MM1
	__m64 _m_psubb(__m64 _MM1, __m64 _MM2);//_mm_sub_pi8 
	//Subtracts the four 16-bit values in _MM2 from the four 16-bit values in _MM1
	__m64 _m_psubw(__m64 _MM1, __m64 _MM2);//_mm_sub_pi16
	//Subtracts the two 32-bit values in _MM2 from the two 32-bit values in _MM1
	__m64 _m_psubd(__m64 _MM1, __m64 _MM2);//_mm_sub_pi32
	//Subtracts the eight signed 8-bit values in _MM2 from the eight signed 8-bit
	//values in _MM1 and saturates
	__m64 _m_psubsb(__m64 _MM1, __m64 _MM2);//_mm_subs_pi8
	//Subtracts the four signed 16-bit values in _MM2 from the four signed 16-bit
	//values in _MM1 and saturates
	__m64 _m_psubsw(__m64 _MM1, __m64 _MM2);//_mm_subs_pi16
	//Subtracts the eight unsigned 8-bit values in _MM2 from the eight unsigned 8-bit
	//values in _MM1 and saturates
	__m64 _m_psubusb(__m64 _MM1, __m64 _MM2);//_mm_subs_pu8
	//Subtracts the four unsigned 16-bit values in _MM2 from the four unsigned 16-bit
	//values in _MM1 and saturates
	__m64 _m_psubusw(__m64 _MM1, __m64 _MM2);//_mm_subs_pu16
	//Multiplies four 16-bit values in _MM1 by four 16-bit values in _MM2 to produce
	//four 32-bit intermediate results, which are then summed by pairs to produce two
	//32-bit results,r0=_MM10*_MM20+_MM11*_MM21, r1=_MM12*_MM22+_MM13*_MM23
	__m64 _m_pmaddwd(__m64 _MM1, __m64 _MM2);//_mm_madd_pi16
	//Multiplies four signed 16-bit values in _MM1 by four signed 16-bit values in _MM2
	//and produces the high 16 bits of the four results
	__m64 _m_pmulhw(__m64 _MM1, __m64 _MM2);//_mm_mulhi_pi16
	//Multiplies four 16-bit values in _MM1 by four 16-bit values in _MM2 and produces
	//the low 16 bits of the four results
	__m64 _m_pmullw(__m64 _MM1, __m64 _MM2);//_mm_mullo_pi16

	/* Shift intrinsics */
	//Shifts four 16-bit values in _M left the amount specified by _Count 
	//while shifting in zeros,左移_Count位，移出位补0
	__m64 _m_psllw(__m64 _M, __m64 _Count);//_mm_sll_pi16
	//Shifts four 16-bit values in _M left the amount specified by _Ccount while 
	//shifting in zeros,左移_Count位，移出位补0,_Count需是一个立即数
	//汇编语言中的立即数相当于高级语言中的常量(常数)，它是直接出现在指令中的数，
	//不用存储在寄存器或存储器中的数
	__m64 _m_psllwi(__m64 _M, int _Count);//_mm_slli_pi16 
	//Shifts two 32-bit values in _M left the amount specified by _Count
	//while shifting in zeros
	__m64 _m_pslld(__m64 _M, __m64 _Count);//_mm_sll_pi32
	//Shifts two 32-bit values in _M left the amount specified by _Count
	//while shifting in zeros
	__m64 _m_pslldi(__m64 _M, int _Count);//_mm_slli_pi32
	//Shifts the 64-bit value in _M left the amount specified by _Count
	//while shifting in zeros
	__m64 _m_psllq(__m64 _M, __m64 _Count);//_mm_sll_si64
	//Shifts the 64-bit value in _M left the amount specified by _Count
	//while shifting in zeros
	__m64 _m_psllqi(__m64 _M, int _Count);//_mm_slli_si64
	//Shifts four 16-bit values in _M right the amount specified by _Count
	//while shifting in the sign bit
	__m64 _m_psraw(__m64 _M, __m64 _Count);//_mm_sra_pi16
	//Shifts four 16-bit values in _M right the amount specified by _Count
	//while shifting in the sign bit
	__m64 _m_psrawi(__m64 _M, int _Count);//_mm_srai_pi16
	//Shifts two 32-bit values in _M right the amount specified by _Count
	//while shifting in the sign bit
	__m64 _m_psrad(__m64 _M, __m64 _Count);//_mm_sra_pi32
	//Shifts two 32-bit values in _M right the amount specified by _Count
	//while shifting in the sign bit
	__m64 _m_psradi(__m64 _M, int _Count);//_mm_srai_pi32
	//Shifts four 16-bit values in _M right the amount specified by _Count
	//while shifting in zeros
	__m64 _m_psrlw(__m64 _M, __m64 _Count);//_mm_srl_pi16
	//Shifts four 16-bit values in _M right the amount specified by _Count
	//while shifting in zeros
	__m64 _m_psrlwi(__m64 _M, int _Count);//_mm_srli_pi16
	//Shifts two 32-bit values in _M right the amount specified by _Count
	//while shifting in zeros
	__m64 _m_psrld(__m64 _M, __m64 _Count);//_mm_srl_pi32
	//Shifts two 32-bit values in _M right the amount specified by _Count
	//while shifting in zeros
	__m64 _m_psrldi(__m64 _M, int _Count);//_mm_srli_pi32 
	//Shifts the 64-bit value in _M right the amount specified by _Count
	//while shifting in zeros
	__m64 _m_psrlq(__m64 _M, __m64 _Count);//_mm_srl_si64
	//Shifts the 64-bit value in _M right the amount specified by _Count
	//while shifting in zeros
	__m64 _m_psrlqi(__m64 _M, int _Count);//_mm_srli_si64
	
	/* Logical intrinsics */
	//Performs a bitwise AND of the 64-bit value in _MM1 with the 64-bit value in _MM2
	__m64 _m_pand(__m64 _MM1, __m64 _MM2);//_mm_and_si64
	//Performs a logical NOT on the 64-bit value in _MM1 and use the result in a 
	//bitwise AND with the 64-bit value in _MM2
	__m64 _m_pandn(__m64 _MM1, __m64 _MM2);//_mm_andnot_si64
	//Performs a bitwise OR of the 64-bit value in _MM1 with the 64-bit value in _MM2
	__m64 _m_por(__m64 _MM1, __m64 _MM2);//_mm_or_si64
	//Performs a bitwise XOR of the 64-bit value in _MM1 with the 64-bit value in _MM2
	__m64 _m_pxor(__m64 _MM1, __m64 _MM2);//_mm_xor_si64

	/* Comparison intrinsics */
	//If the respective 8-bit values in _MM1 are equal to the respective 
	//8-bit values in _MM2, sets the respective 8-bit resulting values to 
	//all ones; otherwise, sets them to all zeros
	__m64 _m_pcmpeqb(__m64 _MM1, __m64 _MM2);//_mm_cmpeq_pi8
	//If the respective 16-bit values in _MM1 are equal to the respective 
	//16-bit values in _MM2, sets the respective 16-bit resulting values 
	//to all ones; otherwise, sets them to all zeros
	__m64 _m_pcmpeqw(__m64 _MM1, __m64 _MM2);//_mm_cmpeq_pi16
	//If the respective 32-bit values in _MM1 are equal to the respective 
	//32-bit values in _MM2, sets the respective 32-bit resulting values
	//to all ones; otherwise, sets them to all zeros
	__m64 _m_pcmpeqd(__m64 _MM1, __m64 _MM2);//_mm_cmpeq_pi32 
	//If the respective 8-bit values in _MM1 are greater than the respective 
	//8-bit values in _MM2, sets the respective 8-bit resulting values to all ones;
	//otherwise, sets them to all zeros
	__m64 _m_pcmpgtb(__m64 _MM1, __m64 _MM2);//_mm_cmpgt_pi8
	//If the respective 16-bit values in _MM1 are greater than the respective 16-bit
	//values in _MM2, sets the respective 16-bit resulting values to all ones;
	//otherwise, sets them to all zeros
	__m64 _m_pcmpgtw(__m64 _MM1, __m64 _MM2);//_mm_cmpgt_pi16
	//If the respective 32-bit values in _MM1 are greater than the respective 32-bit
	//values in _MM2, sets the respective 32-bit resulting values to all ones;
	//otherwise, sets them all to zeros
	__m64 _m_pcmpgtd(__m64 _MM1, __m64 _MM2);//_mm_cmpgt_pi32

	/* Utility intrinsics */
	//Sets the 64-bit value to zero
	__m64 _mm_setzero_si64(void);
	//Sets the two signed 32-bit integer values,r0=_I0, r1=_I1
	__m64 _mm_set_pi32(int _I1, int _I0);
	//r0=_S0, r1=_S1, r2=_S2, r3=_S3
	__m64 _mm_set_pi16(short _S3, short _S2, short _S1, short _S0);
	//r0=_B0, r1=_B1, r2=_B2, r3=_B3, r4=_B4, ..., r7=_B7
	__m64 _mm_set_pi8(char _B7, char _B6, char _B5, char _B4,
						char _B3, char _B2, char _B1, char _B0);
	//Sets the two signed 32-bit integer values to _I,r0=r1=_I
	__m64 _mm_set1_pi32(int _I);
	//Sets the four signed 16-bit integer values to _S, r0=r1=r2=r3=_S
	__m64 _mm_set1_pi16(short _S);
	//Sets the eight signed 8-bit integer values to _B, r0=r1...=r7=_B
	__m64 _mm_set1_pi8(char _B);
	//Sets the two signed 32-bit integer values in reverse order,r0=_I1, r1=_I0
	__m64 _mm_setr_pi32(int _I1, int _I0);
	//Sets the four signed 16-bit integer values in reverse order,
	//r0=_S3, r1=_S2, r2=_S1, r3=_S0
	__m64 _mm_setr_pi16(short _S3, short _S2, short _S1, short _S0);
	//Sets the eight signed 8-bit integer values in reverse order
	//r0=_B7, r1=_B6, r2=_B5, r3=_B4, r4=_B3, r5=_B2, r6=_B1, r7=_B0
	__m64 _mm_setr_pi8(char _B7, char _B6, char _B5, char _B4,
						char _B3, char _B2, char _B1, char _B0);

	/* Alternate intrinsic name definitions */
	#define _mm_empty         _m_empty
	#define _mm_cvtsi32_si64  _m_from_int
	#define _mm_cvtsi64_si32  _m_to_int
	#define _mm_packs_pi16    _m_packsswb
	#define _mm_packs_pi32    _m_packssdw
	#define _mm_packs_pu16    _m_packuswb
	#define _mm_unpackhi_pi8  _m_punpckhbw
	#define _mm_unpackhi_pi16 _m_punpckhwd
	#define _mm_unpackhi_pi32 _m_punpckhdq
	#define _mm_unpacklo_pi8  _m_punpcklbw
	#define _mm_unpacklo_pi16 _m_punpcklwd
	#define _mm_unpacklo_pi32 _m_punpckldq
	#define _mm_add_pi8       _m_paddb
	#define _mm_add_pi16      _m_paddw
	#define _mm_add_pi32      _m_paddd
	#define _mm_adds_pi8      _m_paddsb
	#define _mm_adds_pi16     _m_paddsw
	#define _mm_adds_pu8      _m_paddusb
	#define _mm_adds_pu16     _m_paddusw
	#define _mm_sub_pi8       _m_psubb
	#define _mm_sub_pi16      _m_psubw
	#define _mm_sub_pi32      _m_psubd
	#define _mm_subs_pi8      _m_psubsb
	#define _mm_subs_pi16     _m_psubsw
	#define _mm_subs_pu8      _m_psubusb
	#define _mm_subs_pu16     _m_psubusw
	#define _mm_madd_pi16     _m_pmaddwd
	#define _mm_mulhi_pi16    _m_pmulhw
	#define _mm_mullo_pi16    _m_pmullw
	#define _mm_sll_pi16      _m_psllw
	#define _mm_slli_pi16     _m_psllwi
	#define _mm_sll_pi32      _m_pslld
	#define _mm_slli_pi32     _m_pslldi
	#define _mm_sll_si64      _m_psllq
	#define _mm_slli_si64     _m_psllqi
	#define _mm_sra_pi16      _m_psraw
	#define _mm_srai_pi16     _m_psrawi
	#define _mm_sra_pi32      _m_psrad
	#define _mm_srai_pi32     _m_psradi
	#define _mm_srl_pi16      _m_psrlw
	#define _mm_srli_pi16     _m_psrlwi
	#define _mm_srl_pi32      _m_psrld
	#define _mm_srli_pi32     _m_psrldi
	#define _mm_srl_si64      _m_psrlq
	#define _mm_srli_si64     _m_psrlqi
	#define _mm_and_si64      _m_pand
	#define _mm_andnot_si64   _m_pandn
	#define _mm_or_si64       _m_por
	#define _mm_xor_si64      _m_pxor
	#define _mm_cmpeq_pi8     _m_pcmpeqb
	#define _mm_cmpeq_pi16    _m_pcmpeqw
	#define _mm_cmpeq_pi32    _m_pcmpeqd
	#define _mm_cmpgt_pi8     _m_pcmpgtb
	#define _mm_cmpgt_pi16    _m_pcmpgtw
	#define _mm_cmpgt_pi32    _m_pcmpgtd

参考文献：1、http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/intref_cls/common/intref_mmx_emms_usage.htm