如何用neon实现正弦余弦函数(sin & cos)的优化
基本思想
要想用neon实现对cosf()和sinf()函数的优化,首先需要找到求近似值的方法,然后再将求近似值的函数用neon实现。
1. cosf(x) 和 sinf(x)的近似求解
需要注意的是,这些近似求解方法,都需要将x
限制再[-pi, pi]
范围。可用以下代码实现
float range_neg_pi_to_pi(float x)
{
float y = 2 * pi;
// 求余数,将范围限制在[-2 * pi, 2 * pi]
int quotient = (int)(x / y);
float val = x - (float)quotient * y;
// 将 val限制在[-pi, pi]
if (val > pi)
{
val -= 2 * pi;
}
else if (val < -pi)
{
val += 2 * pi;
}
return val;
}
向量处理,neon代码:
// 向量处理
static inline void range_neg_pi_to_pi_neon_intrinsics(float *x, float *result, float pi_num, int M)
{
int i;
float32x4_t v_2pi = vdupq_n_f32(pi_num * 2);
float32x4_t v_rec_2pi = vdupq_n_f32(1 / (pi_num * 2));
float32x4_t v_pi = vdupq_n_f32(pi_num);
float32x4_t v_neg_pi = vdupq_n_f32(-pi_num);
for (i = 0; i < M - 3; i += 4)
{
float32x4_t v_x = vld1q_f32(x + i);
int32x4_t v_quotient = vcvtq_s32_f32(vmulq_f32(v_x, v_rec_2pi));
float32x4_t v_remainder = vsubq_f32(v_x, vmulq_f32(vcvtq_f32_s32(v_quotient), v_2pi));
uint32x4_t v_mask1 = vcgtq_f32(v_remainder, v_pi);
uint32x4_t v_mask2 = vcltq_f32(v_remainder, v_neg_pi);
float32x4_t v_re1 = vsubq_f32(v_remainder, v_2pi);
float32x4_t v_re2 = vaddq_f32(v_remainder, v_2pi);
v_remainder = vbslq_f32(v_mask1, v_re1, v_remainder);
v_remainder = vbslq_f32(v_mask2, v_re2, v_remainder);
vst1q_f32(result + i, v_remainder);
}
for (; i < M; i++)
{
float y = 2 * pi_num;
int quotient = (int)(x[i] / y);
float val = x[i] - (float)quotient * y;
if (val > pi_num)
{
val -= 2 * pi_num;
}
else if (val < -pi_num)
{
val += 2 * pi_num;
}
result[i] = val;
}
}
// 单个float32x4_t
static inline float32x4_t range_neg_pi_to_pi_x4(float32x4_t v_x, float pi_num)
{
float32x4_t v_2pi = vdupq_n_f32(pi_num * 2);
float32x4_t v_rec_2pi = vdupq_n_f32(1 / (pi_num * 2));
float32x4_t v_pi = vdupq_n_f32(pi_num);
float32x4_t v_neg_pi = vdupq_n_f32(-pi_num);
int32x4_t v_quotient = vcvtq_s32_f32(vmulq_f32(v_x, v_rec_2pi));
float32x4_t v_remainder = vsubq_f32(v_x, vmulq_f32(vcvtq_f32_s32(v_quotient), v_2pi));
uint32x4_t v_mask1 = vcgtq_f32(v_remainder, v_pi);
uint32x4_t v_mask2 = vcltq_f32(v_remainder, v_neg_pi);
float32x4_t v_re1 = vsubq_f32(v_remainder, v_2pi);
float32x4_t v_re2 = vaddq_f32(v_remainder, v_2pi);
v_remainder = vbslq_f32(v_mask1, v_re1, v_remainder);
v_remainder = vbslq_f32(v_mask2, v_re2, v_remainder);
return v_remainder;
}
泰勒级数展开
固定值计算(实际也是泰勒级数展开)
C代码:
// sin
static inline float sinapprox(float val)
{
float val2 = val * val;
return val * (0.9999793767f + val2 * (-0.1666243672f + val2 *
(8.3089787513e-3f + val2 * (-1.9264918228e-4f + val2 *
2.1478401777e-6f))));
}
//cos
static inline float cosapprox(float val) {
float val2 = val*val;
/* Generated in Sollya using:
> f = remez(cos(x)-1, [|x*x, x*x*x*x, x*x*x*x*x*x, x*x*x*x*x*x*x*x|],
[0.000001, pi], 1, 1e-8);
> plot(f-cos(x)+1, [0, pi]);
> f+1
*/
return
1.f + val2 * (-0.4998515820f + val2 * (4.1518035216e-2f + val2 *
(-1.3422947025e-3f + val2 * 1.8929864824e-5f)));
}
参考 approxmath
2. NEON实现
sinf函数
#include <arm_neon.h>
#ifndef pi
#define pi 3.1415926535897932384626433832
#endif
// 方法1 泰勒级数展开法
static inline void neon_sin_approx(float32x4_t input, float32x4_t *sin_result)
{
int iterations = 8;
float32x4_t term = input;
*sin_result = input;
for (int j = 0; j < iterations; j++)
{
term = vmulq_f32(term, vmulq_f32(input, input));
float32x4_t factor = vdupq_n_f32(-1.0f / ((2 * j + 3) * (2 * j + 2)));
term = vmulq_f32(term, factor);
*sin_result = vaddq_f32(*sin_result, term);
}
}
// 方法2 固定值法
static inline void sinapprox_neon_intrinsics(float32x4_t val, float32x4_t *sin_result)
{
float32x4_t num1 = vdupq_n_f32(0.9999793767f);
float32x4_t num2 = vdupq_n_f32(-0.1666243672f);
float32x4_t num3 = vdupq_n_f32(8.3089787513e-3f);
float32x4_t num4 = vdupq_n_f32(-1.9264918228e-4f);
float32x4_t num5 = vdupq_n_f32(2.1478401777e-6f);
float32x4_t val2 = vmulq_f32(val, val);
*sin_result = vmulq_f32(val, vmlaq_f32(num1, val2, vmlaq_f32(num2, val2, vmlaq_f32(num3, val2, vmlaq_f32(num4, val2, num5)))));
}
static inline void vector_sinf_neon_intrinsics(float *input, float *output, int M)
{// 针对向量input求sinf, M为向量的长度
int i;
float32x4_t v_output;
for (i = 0; i < M - 3; i += 4)
{
float32x4_t v_input = vld1q_f32(input + i);
sinapprox_neon_intrinsics(v_input, &v_output); // 可替换求近似值方法neon_sin_approx(v_input, &v_output);
vst1q_f32(output + i, v_output);
}
// 处理剩余元素
for (; i < M; i++)
{
output[i] = sinf(input[i]);
}
}
cosf函数
// 方法1 泰勒级数展开法
static inline void neon_cos_approx(float32x4_t input, float32x4_t *cos_result)
{
int j;
int iterations = 8;
float32x4_t term = vdupq_n_f32(1.0f);
*cos_result = vdupq_n_f32(1.0f);
for (j = 0; j < iterations; j++)
{
term = vmulq_f32(term, vmulq_f32(input, input));
term = vmulq_f32(term, vdupq_n_f32(-1.0f / ((2 * j + 2) * (2 * j + 1))));
*cos_result = vaddq_f32(*cos_result, term);
}
}
// 方法2 固定值法
static inline void cosapprox_neon_intrinsics(float32x4_t val, float32x4_t *cos_result)
{
float32x4_t num1 = vdupq_n_f32(1.f);
float32x4_t num2 = vdupq_n_f32(-0.4998515820f);
float32x4_t num3 = vdupq_n_f32(4.1518035216e-2f);
float32x4_t num4 = vdupq_n_f32(-1.3422947025e-3f);
float32x4_t num5 = vdupq_n_f32(1.8929864824e-5f);
float32x4_t val2 = vmulq_f32(val, val);
*cos_result = vmlaq_f32(num1, val2, vmlaq_f32(num2, val2, vmlaq_f32(num3, val2, vmlaq_f32(num4, val2, num5))));
}
static inline void vector_cosf_neon_intrinsics(float *input, float *output, int M)
{// 针对向量input求sinf, M为向量的长度
int i;
float32x4_t v_output;
for (i = 0; i < M - 3; i += 4)
{
float32x4_t v_input = vld1q_f32(input + i);
cosapprox_neon_intrinsics(v_input, &v_output);// 可替换求近似值方法neon_cos_approx(v_input, &v_output);
vst1q_f32(output + i, v_output);
}
// 处理剩余元素
for (; i < M; i++)
{
output[i] = cosf(input[i]);
}
}
3. 验证
#define pi 3.1415926535897932384626433832
int main {
int frame_length = 512;
float *test_sin1 = malloc(frame_length * sizeof(float));
float *test_sin2 = malloc(frame_length * sizeof(float));
float *t_data_pi = malloc(frame_length * sizeof(float));
float *t_data = malloc(frame_length * sizeof(float));
float temp_t;
for (i = 0; i < frame_length; i++)
{
t_data[i] = 0.85 * 10 * i;
test_sin1[i] = cosf(t_data[i]);
}
range_neg_pi_to_pi_neon_intrinsics(t_data, t_data_pi , pi, frame_length);
vector_sinf_neon_intrinsics(t_data_pi , test_sin2, frame_length);
for (i = 0; i < st->frame_length; i++)
{
printf("test cos: %.10f, %.10f , cos(%.10f)\n", test_sin1[i], test_sin2[i], t_data[i]);
if (abs(test_sin1[i] - test_sin2[i]) > 0.001)
printf("Bad result !!!!!!!!!!!!!!!!!!\n");
}
free(test_sin1);
free(test_sin2);
free(t_data_pi);
free(t_data);
return 0;
}