高性能计算学习笔记（四）

Rookie_whd

已于 2022-11-09 12:36:57 修改

阅读量760

点赞数

文章标签：学习 linux 性能优化

于 2022-11-09 12:35:09 首次发布

本文链接：https://blog.csdn.net/weixin_47034794/article/details/127767223

版权

本文详细介绍了在IPP软件移植中使用的Neon函数，如vcgtq_f32、vcvtq_s32_f32、vmlsq_f32等，并展示了如何通过泰勒展开式计算exp和sin的高精度值。着重讲解了simd化实数Tone信号采样和cos值计算的优化技术。

摘要由CSDN通过智能技术生成

IPP软件移植实践（二）

用到的neon函数介绍（可参考上一篇文章http://t.csdn.cn/atmXa了解更多移植实践函数，具体的neon函数和变量命名规则，可参考我的这一篇文章 http://t.csdn.cn/W6vfR）

uint32x4_t vcgtq_f32 (float32x4_t a, float32x4_t b)
比较a向量中的值是否大于b向量对应通道中的值，从而返回一个比较结果向量值

int32x4_t vcvtq_s32_f32 (float32x4_t a)
类型转换函数，将Ipp32f类型数据转换为Ipp32s类型数据

float32x4_t vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
向量乘减操作，返回给向量寄存器r = a - b * c值

int16x4_t vmovn_s32 (int32x4_t a)
窄化（narrow）操作，将每个通道的Ipp32s数据截取低16位（不做饱和处理），得到低精度的Ipp16s数据

利用vrecpeq_f32函数以及vrecpsq_f32函数求向量v各通道值的高精度倒数，也即求得1/v

/* Start from the per-lane reciprocal estimate of v. */
rec = vrecpeq_f32(v);
/* First refinement: multiply the estimate by the correction factor vrecpsq_f32 returns for it. */
recip1 = vmulq_f32(vrecpsq_f32(v, rec), rec);
/* Second refinement of the same form; recip2 is the higher-precision 1/v. */
recip2 = vmulq_f32(vrecpsq_f32(v, recip1), recip1);

根据exp、sin的泰勒展开式求得exp和sin的精确值

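exp与sin的泰勒展开式如下：

$$e^{x} = \sum_{n=0}^{\infty} \frac{x^{n}}{n!} = 1 + x + \frac{x^{2}}{2!} + \frac{x^{3}}{3!} + \cdots$$

$$\sin x = \sum_{n=0}^{\infty} \frac{(-1)^{n} x^{2n+1}}{(2n+1)!} = x - \frac{x^{3}}{3!} + \frac{x^{5}}{5!} - \cdots$$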

#define eps 1e-7

/*
 * Scalar Taylor-series evaluation of exp() for an array of floats.
 *
 * exp     : input values (len elements).
 * len     : number of elements to process; nothing is done when len <= 0.
 * exp_new : output buffer (len elements); may alias exp.
 *
 * The series is summed on |x| and the reciprocal is taken for negative x
 * (exp(x) = 1 / exp(-x)). All terms are then positive, so there is no
 * cancellation between large alternating terms. Summation stops as soon
 * as a term drops below eps or stops being finite; without the second
 * condition an overflowing term would keep the loop running forever.
 * Overflow yields +inf (or 0 for the negative counterpart); NaN input
 * yields NaN.
 */
void getExp(const float32_t *exp, int len, float32_t *exp_new)
{
	int i;

	for (i = 0; i < len; i++)
	{
		const float32_t x = exp[i];
		const float32_t ax = (x < 0) ? -x : x;
		float32_t term = ax;       /* x^1 / 1! */
		float32_t e = ax + 1.0f;   /* first two terms of the series */
		float32_t y = 2.0f;        /* index of the next term */

		while (term >= eps && isfinite(term))
		{
			/* Scale by ax / y in one step so term * ax cannot overflow early. */
			term *= ax / y;
			e += term;
			y += 1.0f;
		}
		exp_new[i] = (x < 0) ? 1.0f / e : e;
	}
}

/*
 * Element-wise exp() of a float vector, four lanes at a time with NEON.
 *
 * pSrc : input vector (len elements).
 * pDst : output vector (len elements).
 * len  : number of elements.
 *
 * Returns ippStsNullPtrErr if either pointer is NULL, ippStsSizeErr if
 * len <= 0, otherwise ippStsNoErr.
 *
 * The Taylor series is summed on |x| and the reciprocal is taken for the
 * negative lanes, so all terms are positive. A lane stays "active" while
 * its term is >= eps and finite; the loop ends once no lane is active,
 * so an overflowing lane cannot keep it running forever.
 * The trailing len % 4 elements are handled by the scalar getExp().
 */
IppStatus ippsExp_32f_A24(const Ipp32f *pSrc, Ipp32f *pDst, Ipp32s len)
{
	if (pSrc == NULL || pDst == NULL)
		return ippStsNullPtrErr;
	if (len <= 0)
		return ippStsSizeErr;

	const int vec_len = len / 4 * 4; /* largest multiple of 4 not above len */
	int i;

#pragma omp parallel for
	for (i = 0; i < vec_len; i += 4)
	{
		/* Declared inside the loop body so each OpenMP thread has its own copy. */
		const float32x4_t vec_one = vdupq_n_f32(1.0f);
		const float32x4_t vec_prec = vdupq_n_f32(eps);
		const float32x4_t vec_inf = vdupq_n_f32(INFINITY);
		const float32x4_t vec_x = vld1q_f32(&pSrc[i]);
		const float32x4_t vec_ax = vabsq_f32(vec_x);
		float32x4_t vec_term = vec_ax;
		float32x4_t vec_exp = vaddq_f32(vec_ax, vec_one);
		float32_t y = 2.0f;

		for (;;)
		{
			/* Terms are non-negative here, so no absolute value is needed. */
			const uint32x4_t vec_active = vandq_u32(vcgeq_f32(vec_term, vec_prec),
			                                        vcltq_f32(vec_term, vec_inf));
			if (vmaxvq_u32(vec_active) == 0)
				break;
			/* term_n = term_(n-1) * |x| / n */
			vec_term = vmulq_f32(vec_term, vmulq_n_f32(vec_ax, 1.0f / y));
			vec_exp = vaddq_f32(vec_exp, vec_term);
			y += 1.0f;
		}

		/* exp(x) = 1 / exp(|x|) for the lanes whose input was negative. */
		const uint32x4_t vec_neg = vcltq_f32(vec_x, vdupq_n_f32(0.0f));
		vec_exp = vbslq_f32(vec_neg, vdivq_f32(vec_one, vec_exp), vec_exp);
		vst1q_f32(&pDst[i], vec_exp);
	}

	/* Scalar path for the remaining 0..3 elements. */
	if (vec_len < len)
		getExp(&pSrc[vec_len], len - vec_len, &pDst[vec_len]);

	return ippStsNoErr;
}

/*
 * Scalar Taylor-series evaluation of sin() for a single float.
 *
 * sin     : input angle in radians.
 * sin_new : receives the computed sine.
 *
 * The angle is first reduced with fmod() and folded into [-pi, pi].
 * fmod() is defined for every finite input, unlike a float-to-int32
 * conversion of x / (2*pi), and the smaller range keeps the alternating
 * terms small so less precision is lost when they are summed in float.
 * Infinite or NaN input gives NaN.
 */
void getSin_32f(const float32_t sin, float32_t *sin_new)
{
	const double two_pi = 2 * IPP_PI;
	double reduced = fmod((double)sin, two_pi);

	if (reduced > IPP_PI)
		reduced -= two_pi;
	else if (reduced < -IPP_PI)
		reduced += two_pi;

	const float32_t x = (float32_t)reduced;
	const float32_t z = x * x;
	float32_t term = x;
	float32_t res = x;
	int lk = 1; /* exponent of the current term: 1, 3, 5, ... */

	do
	{
		lk += 2;
		/* term_n = -term_(n-1) * x^2 / (lk * (lk - 1)) */
		term = -term * z / (lk * (lk - 1));
		res += term;
	} while (fabs(term) > eps);

	*sin_new = res;
}

/*
 * Element-wise sin() of a float vector, four lanes at a time with NEON.
 *
 * pSrc : input vector (len elements), angles in radians.
 * pDst : output vector (len elements).
 * len  : number of elements.
 *
 * Returns ippStsNullPtrErr if either pointer is NULL, ippStsSizeErr if
 * len <= 0, otherwise ippStsNoErr.
 *
 * Each lane is reduced by k * 2*pi with k = trunc(x / (2*pi)) and then
 * summed with the Taylor series. A lane is "active" while the magnitude
 * of its term is above eps and finite; the loop ends once no lane is
 * active. The finite test keeps an overflowing lane (infinite input, or
 * a magnitude too large for the saturating int32 conversion of k) from
 * running the loop forever; such a lane ends up non-finite.
 * The trailing len % 4 elements are handled by the scalar getSin_32f().
 */
IppStatus ippsSin_32f_A24(const Ipp32f *pSrc, Ipp32f *pDst, Ipp32s len)
{
	if (pSrc == NULL || pDst == NULL)
		return ippStsNullPtrErr;
	if (len <= 0)
		return ippStsSizeErr;

	const int vec_len = len / 4 * 4; /* largest multiple of 4 not above len */
	int i;

#pragma omp parallel for
	for (i = 0; i < vec_len; i += 4)
	{
		/* Declared inside the loop body so each OpenMP thread has its own copy. */
		const float32x4_t vec_two_pi = vdupq_n_f32(2 * IPP_PI);
		const float32x4_t vec_prec = vdupq_n_f32(eps);
		const float32x4_t vec_inf = vdupq_n_f32(INFINITY);
		const float32x4_t vec_nege = vdupq_n_f32(-1.0f);
		const float32x4_t vec_two = vdupq_n_f32(2.0f);
		float32x4_t rec, vec_term, vec_res, vec_lz, vec_lk, vec_den, vec_abs;
		int32x4_t vec_k;
		uint32x4_t vec_active;

		vec_term = vld1q_f32(&pSrc[i]);

		/* Refined reciprocal of 2*pi, then k = trunc(x / (2*pi)). */
		rec = vrecpeq_f32(vec_two_pi);
		rec = vmulq_f32(vrecpsq_f32(vec_two_pi, rec), rec);
		rec = vmulq_f32(vrecpsq_f32(vec_two_pi, rec), rec);
		vec_k = vcvtq_s32_f32(vmulq_f32(vec_term, rec));

		/* x = x - k * 2*pi */
		vec_term = vmlsq_f32(vec_term, vcvtq_f32_s32(vec_k), vec_two_pi);
		vec_res = vec_term;
		vec_lz = vmulq_f32(vec_term, vec_term);
		vec_lk = vdupq_n_f32(1.0f);

		do
		{
			vec_lk = vaddq_f32(vec_lk, vec_two);

			/* Refined reciprocal of the divisor lk * (lk - 1). */
			vec_den = vmulq_f32(vec_lk, vaddq_f32(vec_lk, vec_nege));
			rec = vrecpeq_f32(vec_den);
			rec = vmulq_f32(vrecpsq_f32(vec_den, rec), rec);
			rec = vmulq_f32(vrecpsq_f32(vec_den, rec), rec);

			/* term_n = -term_(n-1) * x^2 / (lk * (lk - 1)) */
			vec_term = vmulq_f32(vmulq_f32(vmulq_f32(vec_term, vec_nege), vec_lz), rec);
			vec_res = vaddq_f32(vec_res, vec_term);

			/* Continue while some lane has not yet reached the precision
			 * target; lanes whose term is inf or NaN are not active. */
			vec_abs = vabsq_f32(vec_term);
			vec_active = vandq_u32(vcgtq_f32(vec_abs, vec_prec),
			                       vcltq_f32(vec_abs, vec_inf));
		} while (vmaxvq_u32(vec_active) != 0);

		vst1q_f32(&pDst[i], vec_res);
	}

	/* Scalar path for the remaining 0..3 elements. */
	for (i = vec_len; i < len; i++)
	{
		getSin_32f(pSrc[i], &pDst[i]);
	}

	return ippStsNoErr;
}

采样生成函数之 Tone函数
tone是数字信号的基本构件。tone生成的过程，即是通过给定频率、初相位和幅度的正弦波，来描述tone函数。
参数说明
pDst: 记录tone采样点的buffer
len: 计算的tone采样点个数。即x[n]中n取值范围为：0,1,2,……,len-2,len-1
magn: 振幅，信号波的最大值
rFreq: 相对于采样频率的归一化频率。实数tone的取值范围为[0.0, 0.5)，复数tone的取值范围为[0.0, 1.0)
pPhase: 初相位，取值范围[0.0, 2π)
hint: ippAlgHintFast：内核使用低精度向量计算。ippAlgHintAccurate:内核使用高精度向量计算
实数tone信号公式
x[n] = magn * cos(2π * n * rFreq + phase)
复数tone信号公式
x[n] = magn * (cos(2π * n * rFreq + phase) + j * sin(2π * n * rFreq + phase))

//采样函数实数Tone采样信号实现
/*
 * Generates a real tone: pDst[n] = magn * cos(2*pi*n*rFreq + phase)
 * for n = 0 .. len-1, four samples at a time with NEON.
 *
 * pDst   : output buffer (len samples).
 * len    : number of samples to generate.
 * magn   : amplitude; must be > 0.
 * rFreq  : frequency relative to the sampling rate; must be in [0, 0.5).
 * pPhase : pointer to the single initial phase value; it is read once
 *          and not modified by this function.
 * hint   : accepted for interface compatibility; not used here.
 *
 * Returns ippStsNullPtrErr, ippStsSizeErr, ippStsToneMagnErr or
 * ippStsToneFreqErr for the corresponding invalid argument, otherwise
 * ippStsNoErr.
 *
 * The trailing len % 4 samples are produced through getCos_tone().
 */
IppStatus ippsTone_16s(Ipp16s *pDst, int len, Ipp16s magn, Ipp32f rFreq, Ipp32f *pPhase, IppHintAlgorithm hint)
{
	if (pDst == NULL || pPhase == NULL)
		return ippStsNullPtrErr;
	if (len <= 0)
		return ippStsSizeErr;
	if (magn <= 0)
		return ippStsToneMagnErr;
	if (rFreq < 0 || rFreq >= 0.5)
		return ippStsToneFreqErr;

	(void)hint;

	/* Per-lane sample offsets; added to the loop index instead of keeping
	 * a len-sized index array on the stack. */
	static const float32_t lane_offset[4] = {0.0f, 1.0f, 2.0f, 3.0f};

	const Ipp32f phase = *pPhase;
	const int vec_len = len / 4 * 4; /* largest multiple of 4 not above len */
	const float32x4_t vec_lane = vld1q_f32(lane_offset);
	const float32x4_t vec_phase = vdupq_n_f32(phase);
	const float32x4_t vec_magn = vdupq_n_f32((Ipp32f)magn);
	const float32x4_t vec_rfreq = vdupq_n_f32(rFreq);
	const float32x4_t vec_two_pi = vdupq_n_f32(2.0 * IPP_PI);
	const float32x4_t vec_nege = vdupq_n_f32(-1.0f);
	const float32x4_t vec_two = vdupq_n_f32(2.0f);
	const float32x4_t vec_prec = vdupq_n_f32(eps);
	const float32x4_t vec_inf = vdupq_n_f32(INFINITY);
	float32x4_t rec, vec_inv_two_pi, vec_index, vec_x, vec_z, vec_lk, vec_den,
		vec_term, vec_cos, vec_abs;
	uint32x4_t vec_active;
	int32x4_t vec_k;
	Ipp16s cos_new;
	int i;

	/* Refined reciprocal of 2*pi; the same for every iteration. */
	rec = vrecpeq_f32(vec_two_pi);
	rec = vmulq_f32(vrecpsq_f32(vec_two_pi, rec), rec);
	vec_inv_two_pi = vmulq_f32(vrecpsq_f32(vec_two_pi, rec), rec);

	for (i = 0; i < vec_len; i += 4)
	{
		/* x = 2*pi * n * rFreq + phase for n = i .. i+3 */
		vec_index = vaddq_f32(vdupq_n_f32((float32_t)i), vec_lane);
		vec_x = vmlaq_f32(vec_phase, vmulq_f32(vec_two_pi, vec_index), vec_rfreq);

		/* Write x = 2*k*pi + alpha and keep only the small remainder alpha. */
		vec_k = vcvtq_s32_f32(vmulq_f32(vec_x, vec_inv_two_pi));
		vec_x = vmlsq_f32(vec_x, vcvtq_f32_s32(vec_k), vec_two_pi);

		/* Taylor series of cos: 1 - x^2/2! + x^4/4! - ... */
		vec_term = vdupq_n_f32(1.0f);
		vec_cos = vec_term;
		vec_z = vmulq_f32(vec_x, vec_x);
		vec_lk = vdupq_n_f32(0.0f);
		do
		{
			vec_lk = vaddq_f32(vec_lk, vec_two);

			/* Refined reciprocal of the divisor lk * (lk - 1). */
			vec_den = vmulq_f32(vec_lk, vaddq_f32(vec_lk, vec_nege));
			rec = vrecpeq_f32(vec_den);
			rec = vmulq_f32(vrecpsq_f32(vec_den, rec), rec);
			rec = vmulq_f32(vrecpsq_f32(vec_den, rec), rec);

			vec_term = vmulq_f32(vmulq_f32(vmulq_f32(vec_term, vec_nege), vec_z), rec);
			vec_cos = vaddq_f32(vec_cos, vec_term);

			/* Continue while some lane's term is above eps and finite;
			 * the finite test stops a non-finite phase from looping forever. */
			vec_abs = vabsq_f32(vec_term);
			vec_active = vandq_u32(vcgtq_f32(vec_abs, vec_prec),
			                       vcltq_f32(vec_abs, vec_inf));
		} while (vmaxvq_u32(vec_active) != 0);

		/* Scale by the amplitude, convert to int32 and narrow to int16 with
		 * saturation so a value slightly above magn cannot wrap around. */
		vec_cos = vmulq_f32(vec_cos, vec_magn);
		vst1_s16(&pDst[i], vqmovn_s32(vcvtq_s32_f32(vec_cos)));
	}

	/* Scalar path for the remaining 0..3 samples. */
	for (i = vec_len; i < len; i++)
	{
		Ipp32f angle = 2 * IPP_PI * (Ipp32f)i * rFreq + phase;
		getCos_tone(angle, &cos_new, magn);
		pDst[i] = cos_new;
	}

	return ippStsNoErr;
}

感兴趣的朋友可以尝试实现cos值以及Tone复数信号采样的simd化
IppStatus ippsCos_32f_A24(const Ipp32f *pSrc, Ipp32f *pDst, Ipp32s len)
IppStatus ippsTone_16sc(Ipp16sc *pDst, int len, Ipp16s magn, Ipp32f rFreq, Ipp32f *pPhase, IppHintAlgorithm hint)