SIMD是Single Instruction, Multiple Data的缩写,意为单指令多数据,即用一条指令同时处理多个数据。Intel的MMX/SSE/AVX等多媒体指令集是其典型实现,编译器以C/C++内联函数(intrinsics)的形式封装接口,底层直接对应汇编指令,因此执行效率很高,适合重复且可以并行的计算场合。
由于编码器在计算SAD(Sum of Absolute Differences,绝对差值和)时需要反复读写数据,并且这种计算适合并行执行,因此JEM在计算SAD时使用了单指令多数据(SIMD)优化。以16×16块的SAD计算为例:
/*
 * Sum of absolute differences (SAD) between a 16x16 source block and a
 * 16x16 reference block, computed with SSE integer intrinsics on 16-bit
 * lanes.
 *
 * The 16x16 block is processed as four 8x8 sub-blocks; inside each sub-block
 * two rows (8 lanes each) are handled per iteration.
 *
 * pSrc       - array of row pointers; row y of the source is pSrc[y].
 * pRef       - top-left sample of the reference block, rows iRefStride apart.
 * iRefStride - distance between consecutive reference rows, in samples.
 * iYOffset   - row index of the block's first row inside pSrc.
 * iXOffset   - column of the block's first sample inside each source row.
 * uiBestSAD  - early-termination threshold.
 *
 * Returns the full SAD, or the partial sum accumulated so far as soon as it
 * exceeds uiBestSAD (the check runs after every pair of rows).
 *
 * NOTE(review): the loads and epi16 arithmetic assume I16 is a 16-bit signed
 * type - confirm against the project typedefs.
 * NOTE(review): the subtraction and the row-pair addition are saturating, so
 * extreme sample differences are clamped rather than wrapped.
 */
UInt GetSAD16x16_SSE_U16(I16 **pSrc, I16 *pRef, Int iRefStride, Int iYOffset, Int iXOffset, UInt uiBestSAD)
{
    __m128i s0, s1;
    __m128i r0, r1;
    __m128i diff, diff0, diff1;
    UInt sumOfRow;
    UInt sum = 0;

    for (int blkRow = 0; blkRow < 2; blkRow++)
    {
        for (int blkCol = 0; blkCol < 2; blkCol++)
        {
            /* Top-left corner of the current 8x8 sub-block (<< 3 == * 8). */
            Int iRelativeOffset = ((blkRow * iRefStride + blkCol) << 3);
            I16 *pRefCurr = pRef + iRelativeOffset;
            Int iYOffsetCurr = iYOffset + (blkRow << 3);
            Int iXOffsetCurr = iXOffset + (blkCol << 3);

            for (int row = 0; row < 8; row += 2)
            {
                /* First row of the pair: |ref - src| per 16-bit lane. */
                r0 = _mm_loadu_si128((const __m128i *)(pRefCurr + iRefStride * row));
                s0 = _mm_loadu_si128((const __m128i *)(pSrc[iYOffsetCurr + row] + iXOffsetCurr));
                diff0 = _mm_subs_epi16(r0, s0);
                diff0 = _mm_abs_epi16(diff0);

                /* Second row of the pair. */
                r1 = _mm_loadu_si128((const __m128i *)(pRefCurr + iRefStride * (1 + row)));
                s1 = _mm_loadu_si128((const __m128i *)(pSrc[iYOffsetCurr + row + 1] + iXOffsetCurr));
                diff1 = _mm_subs_epi16(r1, s1);
                diff1 = _mm_abs_epi16(diff1);

                diff = _mm_adds_epi16(diff0, diff1);
#if USE_SUM
                /* GetSum is defined elsewhere; presumably the horizontal sum
                 * of the eight lanes - verify against its definition. */
                sumOfRow = GetSum(diff);
#else
                /*
                 * Portable horizontal sum of the eight signed 16-bit lanes.
                 * Multiplying by 1 with madd adds adjacent lanes into four
                 * 32-bit sums; two shift-and-add steps fold them into lane 0.
                 * This avoids the Microsoft-only __m128i member access
                 * (m128i_i16), which GCC and Clang do not provide.
                 */
                __m128i acc = _mm_madd_epi16(diff, _mm_set1_epi16(1));
                acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
                acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
                sumOfRow = (UInt)_mm_cvtsi128_si32(acc);
#endif
                sum += sumOfRow;

                /* Early exit: this candidate is already worse than the best. */
                if (sum > uiBestSAD)
                {
                    return sum;
                }
            }
        }
    }
    return sum;
}
一个__m128i变量一次可以读取并处理16个字节的数据(即8个16位样本),相比原先每次只读取2个字节(1个样本)的处理方式,可以明显地减少读写以及加减操作的次数,因此可以明显提高编码速度。
基于SIMD的编码器优化是一个很重要的工作方向!!!