IPP软件移植实践(二)
用到的neon函数介绍(可参考上一篇文章 http://t.csdn.cn/atmXa 了解更多移植实践函数;具体的neon函数和变量命名规则,可参考我的这一篇文章 http://t.csdn.cn/W6vfR )
uint32x4_t vcgtq_f32 (float32x4_t a, float32x4_t b)
比较a向量中的值是否大于b向量对应通道中的值,从而返回一个比较结果向量值
int32x4_t vcvtq_s32_f32 (float32x4_t a)
类型转换函数,将Ipp32f类型数据转换为Ipp32s类型数据(小数部分向零截断)
float32x4_t vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
向量乘减操作,返回给向量寄存器r = a - b * c值
int16x4_t vmovn_s32 (int32x4_t a)
窄化(narrow)操作:保留每个32位通道的低16位,将Ipp32s(int32x4_t)向量转换为低位宽的Ipp16s(int16x4_t)向量;超出16位表示范围的值会被截断而不是饱和(需要饱和时可使用vqmovn_s32)
利用vrecpeq_f32函数以及vrecpsq_f32函数求向量v各通道值的高精度倒数,也即求得1/v
rec = vrecpeq_f32(v);
recip1 = vmulq_f32(vrecpsq_f32(v, rec), rec);
recip2 = vmulq_f32(vrecpsq_f32(v, recip1), recip1);
根据exp、sin的泰勒展开式求得exp和sin的近似值(精度由eps控制:当新增项的绝对值小于eps时停止累加)
/* Convergence threshold: the series loops in this file stop once the
 * magnitude of the newest term is no longer above this value. */
#define eps 1e-7
/*
 * Serial Taylor-series evaluation of e^x.
 *
 * exp     - input values, `len` elements (read only)
 * len     - number of elements to process; nothing is written when len <= 0
 * exp_new - output buffer; exp_new[i] receives e^(exp[i]) for i in [0, len)
 *
 * The series 1 + x + x^2/2! + ... is summed for |x| and the reciprocal of the
 * sum is returned for negative inputs.  Summing the alternating series
 * directly loses every significant digit to cancellation once x is more than
 * a few units below zero.
 *
 * The summation stops when the newest term drops below `eps`, or when the term
 * is no longer finite (overflow, +/-inf or NaN input).  Without the second
 * condition the loop would never end for such inputs, because an infinite
 * term never shrinks.
 */
void getExp(const float32_t *exp, int len, float32_t *exp_new)
{
    int i;

    for (i = 0; i < len; i++)
    {
        const float32_t x = exp[i];
        const float32_t ax = fabsf(x);
        float32_t y = 2.0f;       /* index of the next term, i.e. the next factorial factor */
        float32_t term = ax;      /* current term |x|^n / n! (never negative) */
        float32_t e = ax + 1.0f;  /* running sum, seeded with the first two terms */

        while (term >= eps && isfinite(term))
        {
            const float32_t temp = term * ax;

            term = temp * (1 / y);
            e = e + term;
            y++;
        }

        /* e^(-a) == 1 / e^a; an overflowed sum (+inf) correctly maps to 0. */
        exp_new[i] = (x < 0) ? 1.0f / e : e;
    }
}
/*
 * Element-wise e^x for a float vector, four lanes at a time with NEON.
 *
 * pSrc - input vector, `len` elements
 * pDst - output vector, `len` elements
 * len  - number of elements
 *
 * Returns ippStsNullPtrErr when either pointer is NULL, ippStsSizeErr when
 * len <= 0, and ippStsNoErr otherwise.
 *
 * Full groups of four elements are evaluated with a Taylor series on |x|;
 * lanes whose input is negative receive the reciprocal of the sum.  The
 * remaining (len % 4) elements are delegated to getExp().
 */
IppStatus ippsExp_32f_A24(const Ipp32f *pSrc, Ipp32f *pDst, Ipp32s len)
{
    if (pSrc == NULL || pDst == NULL)
        return ippStsNullPtrErr;
    else if (len <= 0)
        return ippStsSizeErr;

    /* Read-only constants: safe to share between OpenMP threads. */
    const float32x4_t vec_eps = vdupq_n_f32((float32_t)eps);
    const float32x4_t vec_inf = vdupq_n_f32(INFINITY);
    const float32x4_t vec_one = vdupq_n_f32(1.0f);
    const int vec_len = len / 4 * 4; /* number of elements covered by full 4-lane groups */
    int i;

#pragma omp parallel for
    for (i = 0; i < vec_len; i += 4)
    {
        /* All mutable state is declared inside the loop body so that every
         * thread works on its own copy (variables declared outside a
         * parallel region are shared and would race). */
        const float32x4_t vec_x = vld1q_f32(&pSrc[i]);
        const float32x4_t vec_ax = vabsq_f32(vec_x);
        float32x4_t vec_term = vec_ax;                     /* |x|^n / n!, never negative */
        float32x4_t vec_exp = vaddq_f32(vec_ax, vec_one);  /* 1 + |x| */
        float32_t y = 2.0f;                                /* next factorial factor */

        for (;;)
        {
            /* A lane is still "active" while its term is above eps AND
             * finite.  Lanes holding inf/NaN can never converge and must not
             * keep the loop alive. */
            const uint32x4_t active = vandq_u32(vcgtq_f32(vec_term, vec_eps),
                                                vcltq_f32(vec_term, vec_inf));

            if (vmaxvq_u32(active) == 0)
                break;

            /* y is identical in all lanes, so one scalar reciprocal is enough. */
            vec_term = vmulq_n_f32(vmulq_f32(vec_term, vec_ax), 1.0f / y);
            vec_exp = vaddq_f32(vec_exp, vec_term);
            y += 1.0f;
        }

        /* e^(-a) == 1 / e^a for the lanes whose input was negative. */
        vec_exp = vbslq_f32(vcltq_f32(vec_x, vdupq_n_f32(0.0f)),
                            vdivq_f32(vec_one, vec_exp),
                            vec_exp);
        vst1q_f32(&pDst[i], vec_exp);
    }

    /* At most three trailing elements: handled serially, each with its own
     * result slot. */
    for (i = vec_len; i < len; i++)
    {
        float32_t single;

        getExp(&pSrc[i], 1, &single);
        pDst[i] = single;
    }
    return ippStsNoErr;
}
/*
 * Serial Taylor-series evaluation of sin(x).
 *
 * sin     - input angle in radians
 * sin_new - receives the result
 *
 * The argument is first reduced to [-pi, pi] with remainder(), which is
 * defined for every finite magnitude.  Converting x / (2*pi) to an integer
 * instead is undefined behaviour once the quotient no longer fits that
 * integer type.  Non-finite inputs (+/-inf, NaN) yield NaN; feeding them to
 * the series would loop forever because an infinite term never shrinks.
 */
void getSin_32f(const float32_t sin, float32_t *sin_new)
{
    float32_t x = sin;
    float32_t res, term, z;
    int lk;

    if (!isfinite(x))
    {
        *sin_new = (float32_t)NAN;
        return;
    }

    x = (float32_t)remainder(x, 2 * IPP_PI);

    term = x;   /* first series term: x */
    res = x;
    lk = 1;     /* exponent of the current term */
    z = x * x;
    do
    {
        /* term_{n+1} = -term_n * x^2 / (lk * (lk - 1)) with lk = 3, 5, 7, ... */
        lk = lk + 2;
        term = -term * z / (lk * (lk - 1));
        res = res + term;
    } while (fabs(term) > eps);
    *sin_new = res;
}
/*
 * Element-wise sin(x) for a float vector, four lanes at a time with NEON.
 *
 * pSrc - input vector (radians), `len` elements
 * pDst - output vector, `len` elements
 * len  - number of elements
 *
 * Returns ippStsNullPtrErr when either pointer is NULL, ippStsSizeErr when
 * len <= 0, and ippStsNoErr otherwise.
 *
 * Full groups of four elements are range-reduced by the nearest multiple of
 * 2*pi and evaluated with the Taylor series of sin; the remaining (len % 4)
 * elements are delegated to getSin_32f().
 *
 * Limitation: the range reduction is done in single precision, so inputs of
 * very large magnitude are reduced inaccurately and their lanes may end up as
 * inf/NaN.  The loop still terminates because such lanes are excluded from
 * the convergence test.
 */
IppStatus ippsSin_32f_A24(const Ipp32f *pSrc, Ipp32f *pDst, Ipp32s len)
{
    if (pSrc == NULL || pDst == NULL)
        return ippStsNullPtrErr;
    else if (len <= 0)
        return ippStsSizeErr;

    /* Read-only constants: safe to share between OpenMP threads. */
    const float32_t inv_two_pi = (float32_t)(1.0 / (2 * IPP_PI));
    const float32x4_t vec_two_pi = vdupq_n_f32((float32_t)(2 * IPP_PI));
    const float32x4_t vec_prec = vdupq_n_f32((float32_t)eps);
    const float32x4_t vec_inf = vdupq_n_f32(INFINITY);
    const int vec_len = len / 4 * 4; /* number of elements covered by full 4-lane groups */
    int i;

#pragma omp parallel for
    for (i = 0; i < vec_len; i += 4)
    {
        /* All mutable state lives inside the loop body so that every thread
         * has its own copy (variables declared outside a parallel region are
         * shared and would race). */
        const float32x4_t vec_src = vld1q_f32(&pSrc[i]);

        /* k = round(x / 2pi); x -= k * 2pi  ->  reduced argument near [-pi, pi].
         * k stays in floating point, so there is no integer saturation. */
        const float32x4_t vec_k = vrndnq_f32(vmulq_n_f32(vec_src, inv_two_pi));
        float32x4_t vec_term = vmlsq_f32(vec_src, vec_k, vec_two_pi);
        float32x4_t vec_res = vec_term;
        const float32x4_t vec_lz = vmulq_f32(vec_term, vec_term);
        float32_t lk = 1.0f; /* exponent of the current term, same in all lanes */
        uint32_t flag;

        do
        {
            float32x4_t vec_abs;

            /* term_{n+1} = -term_n * x^2 / (lk * (lk - 1)) with lk = 3, 5, 7, ...
             * The divisor is lane-uniform, so a single scalar reciprocal is used. */
            lk += 2.0f;
            vec_term = vmulq_n_f32(vmulq_f32(vnegq_f32(vec_term), vec_lz),
                                   1.0f / (lk * (lk - 1.0f)));
            vec_res = vaddq_f32(vec_res, vec_term);

            /* Keep iterating while any lane has a finite term above eps. */
            vec_abs = vabsq_f32(vec_term);
            flag = vmaxvq_u32(vandq_u32(vcgtq_f32(vec_abs, vec_prec),
                                        vcltq_f32(vec_abs, vec_inf)));
        } while (flag != 0);

        vst1q_f32(&pDst[i], vec_res);
    }

    /* At most three trailing elements: handled serially, each with its own
     * result slot. */
    for (i = vec_len; i < len; i++)
    {
        float32_t single;

        getSin_32f(pSrc[i], &single);
        pDst[i] = single;
    }
    return ippStsNoErr;
}
采样生成函数之 Tone函数
tone是数字信号的基本构件。tone生成的过程,即是通过给定频率、初相位和幅度的正弦波,来描述tone函数。
参数说明
pDst: 记录tone采样点的buffer
len: 计算的tone采样点个数。即x[n]中n取值范围为:0,1,2,……,len-2,len-1
magn: 振幅,信号波的最大值
rFreq: 相对于采样频率的归一化频率(相对频率)。实数tone的取值范围为[0.0, 0.5),复数tone的取值范围为[0.0, 1.0)
pPhase: 初相位,取值范围[0.0, 2π)
hint: ippAlgHintFast:内核使用低精度向量计算。ippAlgHintAccurate:内核使用高精度向量计算
实数tone信号公式
$x[n] = \mathrm{magn} \cdot \cos(2\pi \cdot n \cdot \mathrm{rFreq} + \mathrm{phase})$
复数tone信号公式
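$x[n] = \mathrm{magn} \cdot \left(\cos(2\pi \cdot n \cdot \mathrm{rFreq} + \mathrm{phase}) + j \cdot \sin(2\pi \cdot n \cdot \mathrm{rFreq} + \mathrm{phase})\right)$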
// Real-valued tone sampler: pDst[n] = magn * cos(2*pi*n*rFreq + phase), n in [0, len).
//
// pDst   - output buffer, `len` samples
// len    - number of samples to generate
// magn   - amplitude, must be > 0
// rFreq  - frequency relative to the sampling frequency, must be in [0.0, 0.5)
// pPhase - pointer to the initial phase in radians; only *pPhase is read
// hint   - accepted for interface compatibility; not used by this implementation
//
// Returns ippStsNullPtrErr, ippStsSizeErr, ippStsToneMagnErr or
// ippStsToneFreqErr on invalid arguments (a NaN rFreq is rejected too), and
// ippStsNoErr otherwise.
//
// NOTE(review): Intel IPP's tone generator also writes the phase of the next
// block back through pPhase; this port never did - confirm whether callers
// rely on that before adding it.
// NOTE(review): the angle is built in single precision, so accuracy degrades
// as n * rFreq grows large.
IppStatus ippsTone_16s(Ipp16s *pDst, int len, Ipp16s magn, Ipp32f rFreq, Ipp32f *pPhase, IppHintAlgorithm hint)
{
    if (pDst == NULL || pPhase == NULL)
        return ippStsNullPtrErr;
    else if (len <= 0)
        return ippStsSizeErr;
    else if (magn <= 0)
        return ippStsToneMagnErr;
    else if (!(rFreq >= 0 && rFreq < 0.5)) /* written this way so NaN is rejected */
        return ippStsToneFreqErr;

    (void)hint;

    /* Sample indices are generated on the fly from these lane offsets; no
     * len-sized scratch array on the stack is needed. */
    static const float32_t lane_offset[4] = {0.0f, 1.0f, 2.0f, 3.0f};

    /* The phase is a single scalar shared by every sample. */
    const Ipp32f phase = *pPhase;
    const float32_t inv_two_pi = (float32_t)(1.0 / (2.0 * IPP_PI));
    const float32x4_t vec_lane = vld1q_f32(lane_offset);
    const float32x4_t vec_phase = vdupq_n_f32(phase);
    const float32x4_t vec_rfreq = vdupq_n_f32(rFreq);
    const float32x4_t vec_two_pi = vdupq_n_f32((float32_t)(2.0 * IPP_PI));
    const float32x4_t vec_magn = vdupq_n_f32((Ipp32f)magn);
    const float32x4_t vec_prec = vdupq_n_f32((float32_t)eps);
    const float32x4_t vec_inf = vdupq_n_f32(INFINITY);
    const int vec_len = len / 4 * 4; /* number of samples covered by full 4-lane groups */
    int i;

    for (i = 0; i < vec_len; i += 4)
    {
        /* angle = 2*pi*n*rFreq + phase for n = i .. i+3 */
        const float32x4_t vec_index = vaddq_f32(vdupq_n_f32((float32_t)i), vec_lane);
        float32x4_t vec_x = vmlaq_f32(vec_phase, vmulq_f32(vec_two_pi, vec_index), vec_rfreq);

        /* Range reduction: angle = 2*k*pi + alpha, keep alpha near [-pi, pi].
         * k stays in floating point, so there is no integer saturation. */
        const float32x4_t vec_k = vrndnq_f32(vmulq_n_f32(vec_x, inv_two_pi));
        vec_x = vmlsq_f32(vec_x, vec_k, vec_two_pi);

        /* Taylor series of cos: 1 - x^2/2! + x^4/4! - ... */
        const float32x4_t vec_z = vmulq_f32(vec_x, vec_x);
        float32x4_t vec_term = vdupq_n_f32(1.0f);
        float32x4_t vec_cos = vec_term;
        float32_t lk = 0.0f; /* exponent of the current term, same in all lanes */
        uint32_t flag;

        do
        {
            float32x4_t vec_abs;

            /* term_{n+1} = -term_n * x^2 / (lk * (lk - 1)) with lk = 2, 4, 6, ...
             * The divisor is lane-uniform, so a single scalar reciprocal is used. */
            lk += 2.0f;
            vec_term = vmulq_n_f32(vmulq_f32(vnegq_f32(vec_term), vec_z),
                                   1.0f / (lk * (lk - 1.0f)));
            vec_cos = vaddq_f32(vec_cos, vec_term);

            /* Keep iterating while any lane has a finite term above eps;
             * inf/NaN lanes can never converge and must not keep the loop alive. */
            vec_abs = vabsq_f32(vec_term);
            flag = vmaxvq_u32(vandq_u32(vcgtq_f32(vec_abs, vec_prec),
                                        vcltq_f32(vec_abs, vec_inf)));
        } while (flag != 0);

        /* Scale, convert to 32-bit integers (truncating), then narrow to
         * 16 bits with saturation so a value a hair above magn cannot wrap
         * around to a negative sample. */
        const int32x4_t vec_scaled = vcvtq_s32_f32(vmulq_f32(vec_cos, vec_magn));
        vst1_s16(&pDst[i], vqmovn_s32(vec_scaled));
    }

    /* At most three trailing samples are produced by the scalar helper. */
    for (i = vec_len; i < len; i++)
    {
        Ipp16s sample;
        const Ipp32f angle = 2 * IPP_PI * (Ipp32f)i * rFreq + phase;

        getCos_tone(angle, &sample, magn);
        pDst[i] = sample;
    }
    return ippStsNoErr;
}
感兴趣的朋友可以尝试实现cos值以及Tone复数信号采样的simd化
IppStatus ippsCos_32f_A24(const Ipp32f *pSrc, Ipp32f *pDst, Ipp32s len)
IppStatus ippsTone_16sc(Ipp16sc *pDst, int len, Ipp16s magn, Ipp32f rFreq, Ipp32f *pPhase, IppHintAlgorithm hint)