权值更新就是使用SSE指令集加速完成NLMS算法的频域计算,具体实现细节见代码注释。
static void FilterAdaptationSSE2(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]) {
int i, j;
const int num_partitions = aec->num_partitions; // 分块数
for (i = 0; i < num_partitions; i++) {
// xpos指向分块的第一个元素,每块65个元素
int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
int pos = i * PART_LEN1;
// 边界检查,如果分块位置超过块数,回到缓存的第一块的起始地址
if (i + aec->xfBufBlockPos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
// Process the whole array...
for (j = 0; j < PART_LEN; j += 4) {
// Load xfBuf(远场数据的频域表达) and ef(消回声估计误差的频域表达).
/*_mm_load_ps它的输入是一个指向float的指针,返回的就是一个__m128类型的数据,从函数的角度理解,
就是把一个float数组的四个元素依次读取,返回一个组合的__m128类型的SSE数据类型,
从而可以使用这个返回的结果传递给其它的SSE指令进行运算,比如加法等;从汇编的角度理解,
它对应的就是读取内存中连续四个地址的float数据,将其放入SSE新的暂存器(XMM0~8)中,
从而给其他的指令准备好数据进行计算,_ps表示并行。*/
// 加载数据
const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
const __m128 ef_re = _mm_loadu_ps(&ef[0][j]);
const __m128 ef_im = _mm_loadu_ps(&ef[1][j]);
// Calculate the product of conjugate(xfBuf) by ef.
// re(conjugate(a) * b) = aRe * bRe + aIm * bIm 实部
// im(conjugate(a) * b)= aRe * bIm - aIm * bRe 虚部
/*__m128 _mm_mul_ps(__m128 a , __m128 b )将a和b的四个单精度浮点值相乘。
返回值r0 := a0 * b0;r1 := a1 * b1;r2 := a2 * b2;r3 := a3 * b3*/
const __m128 a = _mm_mul_ps(xfBuf_re, ef_re); // 远场数据实部与误差实部相乘
const __m128 b = _mm_mul_ps(xfBuf_im, ef_im); // 远场数据虚部与误差虚部相乘
const __m128 c = _mm_mul_ps(xfBuf_re, ef_im); // 远场数据实部与误差虚部相乘
const __m128 d = _mm_mul_ps(xfBuf_im, ef_re); // 远场数据虚部与误差实部相乘
const __m128 e = _mm_add_ps(a, b); // a+b
const __m128 f = _mm_sub_ps(c, d); // c-d
// Interleave real and imaginary parts.
// extern __m128 _mm_unpacklo_ps(__m128 _A, __m128 _B);
// 返回一个__m128的寄存器,Selects and interleaves the lower two single-precision,
// floating-point values from a and b
// r0=_A0, r1=_B0, r2=_A1, r3=_B1
const __m128 g = _mm_unpacklo_ps(e, f);
// extern __m128 _mm_unpackhi_ps(__m128 _A, __m128 _B);
// 返回一个__m128的寄存器,Selects and interleaves the upper two single-precision,
// floating-point values from a and b
// r0=_A2, r1=_B2, r2=_A3, r3=_B3
const __m128 h = _mm_unpackhi_ps(e, f);
// Store,extern void _mm_storeu_ps(float *_V, __m128 _A);
//返回为空,Stores four single-precision, floating-point values,
//The address does not need to be 16-byte aligned
//_V[0]=_A0, _V[1]=_A1, _V[2]=_A2, _V[3]=_A3
_mm_storeu_ps(&fft[2 * j + 0], g);
_mm_storeu_ps(&fft[2 * j + 4], h);
}
// ... and fixup the first imaginary entry.
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
-aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN],
ef[1][PART_LEN]);
// 对fft进行傅里叶逆变换
aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
// fft scaling
{
float scale = 2.0f / PART_LEN2;
const __m128 scale_ps = _mm_load_ps1(&scale);
for (j = 0; j < PART_LEN; j += 4) {
const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
_mm_storeu_ps(&fft[j], fft_scale);
}
}
aec_rdft_forward_128(fft);
// 权值更新,权值为频域表达形式
{
float wt1 = aec->wfBuf[1][pos];
aec->wfBuf[0][pos + PART_LEN] += fft[1];
for (j = 0; j < PART_LEN; j += 4) {
__m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
__m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
const __m128 fft_re =
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 fft_im =
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
_mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
_mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im);
}
aec->wfBuf[1][pos] = wt1;
}
}
}