如何用neon实现正弦余弦函数(sin & cos)的优化

基本思想

要想用neon实现对cosf()和sinf()函数的优化,首先需要找到求近似值的方法,然后再将求近似值的函数用neon实现。

1. cosf(x) 和 sinf(x)的近似求解

需要注意的是,这些近似求解方法,都需要将x限制再[-pi, pi]范围。可用以下代码实现

float range_neg_pi_to_pi(float x)
{
    float y = 2 * pi;
// 求余数,将范围限制在[-2 * pi, 2 * pi]
    int quotient = (int)(x / y);
    float val = x - (float)quotient * y;
// 将 val限制在[-pi, pi]
    if (val > pi)
    {
        val -= 2 * pi;
    }
    else if (val < -pi)
    {
        val += 2 * pi;
    }
    return val;
}

向量处理,neon代码:


// 向量处理
static inline void range_neg_pi_to_pi_neon_intrinsics(float *x, float *result, float pi_num, int M)
{
    int i;
    float32x4_t v_2pi = vdupq_n_f32(pi_num * 2);
    float32x4_t v_rec_2pi = vdupq_n_f32(1 / (pi_num * 2));
    float32x4_t v_pi = vdupq_n_f32(pi_num);
    float32x4_t v_neg_pi = vdupq_n_f32(-pi_num);

    for (i = 0; i < M - 3; i += 4)
    {
        float32x4_t v_x = vld1q_f32(x + i);
        int32x4_t v_quotient = vcvtq_s32_f32(vmulq_f32(v_x, v_rec_2pi));
        float32x4_t v_remainder = vsubq_f32(v_x, vmulq_f32(vcvtq_f32_s32(v_quotient), v_2pi));

        uint32x4_t v_mask1 = vcgtq_f32(v_remainder, v_pi);
        uint32x4_t v_mask2 = vcltq_f32(v_remainder, v_neg_pi);

        float32x4_t v_re1 = vsubq_f32(v_remainder, v_2pi);
        float32x4_t v_re2 = vaddq_f32(v_remainder, v_2pi);
        v_remainder = vbslq_f32(v_mask1, v_re1, v_remainder);
        v_remainder = vbslq_f32(v_mask2, v_re2, v_remainder);

        vst1q_f32(result + i, v_remainder);
    }

    for (; i < M; i++)
    {
        float y = 2 * pi_num;
        int quotient = (int)(x[i] / y);
        float val = x[i] - (float)quotient * y;

        if (val > pi_num)
        {
            val -= 2 * pi_num;
        }
        else if (val < -pi_num)
        {
            val += 2 * pi_num;
        }

        result[i] = val;
    }
}

// 单个float32x4_t 
static inline float32x4_t range_neg_pi_to_pi_x4(float32x4_t v_x, float pi_num)
{
    float32x4_t v_2pi = vdupq_n_f32(pi_num * 2);
    float32x4_t v_rec_2pi = vdupq_n_f32(1 / (pi_num * 2));
    float32x4_t v_pi = vdupq_n_f32(pi_num);
    float32x4_t v_neg_pi = vdupq_n_f32(-pi_num);

    int32x4_t v_quotient = vcvtq_s32_f32(vmulq_f32(v_x, v_rec_2pi));
    float32x4_t v_remainder = vsubq_f32(v_x, vmulq_f32(vcvtq_f32_s32(v_quotient), v_2pi));

    uint32x4_t v_mask1 = vcgtq_f32(v_remainder, v_pi);
    uint32x4_t v_mask2 = vcltq_f32(v_remainder, v_neg_pi);

    float32x4_t v_re1 = vsubq_f32(v_remainder, v_2pi);
    float32x4_t v_re2 = vaddq_f32(v_remainder, v_2pi);
    v_remainder = vbslq_f32(v_mask1, v_re1, v_remainder);
    v_remainder = vbslq_f32(v_mask2, v_re2, v_remainder);

    return v_remainder;
}

泰勒级数展开

在这里插入图片描述

固定值计算(实际也是泰勒级数展开)

C代码:

// sin
static inline float sinapprox(float val)
{
    float val2 = val * val;
    return val * (0.9999793767f + val2 * (-0.1666243672f + val2 *
          (8.3089787513e-3f + val2 * (-1.9264918228e-4f + val2 *
           2.1478401777e-6f))));
}

//cos
static inline float cosapprox(float val) {
  float val2 = val*val;
  /* Generated in Sollya using:
     > f = remez(cos(x)-1, [|x*x, x*x*x*x, x*x*x*x*x*x, x*x*x*x*x*x*x*x|],
                            [0.000001, pi], 1, 1e-8);
     > plot(f-cos(x)+1, [0, pi]);
     > f+1
  */
  return
    1.f + val2 * (-0.4998515820f + val2 * (4.1518035216e-2f + val2 *
      (-1.3422947025e-3f + val2 * 1.8929864824e-5f)));
}

参考 approxmath

2. NEON实现

sinf函数

#include <arm_neon.h>

#ifndef pi
#define pi 3.1415926535897932384626433832
#endif

// 方法1 泰勒级数展开法
static inline void neon_sin_approx(float32x4_t input, float32x4_t *sin_result)
{
    int iterations = 8;

    float32x4_t term = input;
    *sin_result = input;

    for (int j = 0; j < iterations; j++)
    {
        term = vmulq_f32(term, vmulq_f32(input, input));
        float32x4_t factor = vdupq_n_f32(-1.0f / ((2 * j + 3) * (2 * j + 2)));
        term = vmulq_f32(term, factor);
        *sin_result = vaddq_f32(*sin_result, term);
    }
}

// 方法2 固定值法
static inline void sinapprox_neon_intrinsics(float32x4_t val, float32x4_t *sin_result)
{
    float32x4_t num1 = vdupq_n_f32(0.9999793767f);
    float32x4_t num2 = vdupq_n_f32(-0.1666243672f);
    float32x4_t num3 = vdupq_n_f32(8.3089787513e-3f);
    float32x4_t num4 = vdupq_n_f32(-1.9264918228e-4f);
    float32x4_t num5 = vdupq_n_f32(2.1478401777e-6f);

    float32x4_t val2 = vmulq_f32(val, val);
    *sin_result = vmulq_f32(val, vmlaq_f32(num1, val2, vmlaq_f32(num2, val2, vmlaq_f32(num3, val2, vmlaq_f32(num4, val2, num5)))));
}

static inline void vector_sinf_neon_intrinsics(float *input, float *output, int M)
{// 针对向量input求sinf, M为向量的长度
    int i;
    float32x4_t v_output;
    for (i = 0; i < M - 3; i += 4)
    {
        float32x4_t v_input = vld1q_f32(input + i);
        sinapprox_neon_intrinsics(v_input, &v_output); //  可替换求近似值方法neon_sin_approx(v_input, &v_output);
        vst1q_f32(output + i, v_output);
    }
// 处理剩余元素
    for (; i < M; i++)
    {
        output[i] = sinf(input[i]);
    }
}

cosf函数

// 方法1 泰勒级数展开法
static inline void neon_cos_approx(float32x4_t input, float32x4_t *cos_result)
{
    int j;
    int iterations = 8;

    float32x4_t term = vdupq_n_f32(1.0f);
    *cos_result = vdupq_n_f32(1.0f);

    for (j = 0; j < iterations; j++)
    {
        term = vmulq_f32(term, vmulq_f32(input, input));
        term = vmulq_f32(term, vdupq_n_f32(-1.0f / ((2 * j + 2) * (2 * j + 1))));
        *cos_result = vaddq_f32(*cos_result, term);
    }
}

// 方法2 固定值法
static inline void cosapprox_neon_intrinsics(float32x4_t val, float32x4_t *cos_result)
{
    float32x4_t num1 = vdupq_n_f32(1.f);
    float32x4_t num2 = vdupq_n_f32(-0.4998515820f);
    float32x4_t num3 = vdupq_n_f32(4.1518035216e-2f);
    float32x4_t num4 = vdupq_n_f32(-1.3422947025e-3f);
    float32x4_t num5 = vdupq_n_f32(1.8929864824e-5f);
    float32x4_t val2 = vmulq_f32(val, val);
    *cos_result = vmlaq_f32(num1, val2, vmlaq_f32(num2, val2, vmlaq_f32(num3, val2, vmlaq_f32(num4, val2, num5))));
}

static inline void vector_cosf_neon_intrinsics(float *input, float *output, int M)
{// 针对向量input求sinf, M为向量的长度
    int i;
    float32x4_t v_output;
    for (i = 0; i < M - 3; i += 4)
    {
        float32x4_t v_input = vld1q_f32(input + i);
        cosapprox_neon_intrinsics(v_input, &v_output);//  可替换求近似值方法neon_cos_approx(v_input, &v_output);
        vst1q_f32(output + i, v_output);
    }
// 处理剩余元素
    for (; i < M; i++)
    {
        output[i] = cosf(input[i]);
    }
}

3. 验证

#define pi 3.1415926535897932384626433832

int main {
	int frame_length  = 512;
    float *test_sin1 = malloc(frame_length * sizeof(float));
    float *test_sin2 = malloc(frame_length * sizeof(float));
    float *t_data_pi = malloc(frame_length * sizeof(float));
    float *t_data = malloc(frame_length * sizeof(float));
    float temp_t;
    for (i = 0; i < frame_length; i++)
    {
        t_data[i] = 0.85 * 10 * i;
        test_sin1[i] = cosf(t_data[i]);
    }
    range_neg_pi_to_pi_neon_intrinsics(t_data, t_data_pi , pi, frame_length);
    vector_sinf_neon_intrinsics(t_data_pi , test_sin2, frame_length);
    for (i = 0; i < st->frame_length; i++)
    {
        printf("test cos: %.10f,   %.10f  ,  cos(%.10f)\n", test_sin1[i], test_sin2[i], t_data[i]);
        if (abs(test_sin1[i] - test_sin2[i]) > 0.001)
            printf("Bad   result !!!!!!!!!!!!!!!!!!\n");
    }

    free(test_sin1);
    free(test_sin2);
    free(t_data_pi);
    free(t_data);
    return 0;
    }
在ARM架构中,NEON是一种先进的SIMD(单指令多数据)技术,可以用于加速多媒体和信号处理应用。使用NEON指令集来实现`memcpy`函数可以大幅提升小块数据的拷贝性能,因为NEON能够在一个指令周期内处理多个数据项。 下面是一个简化的例子,展示了如何使用NEON指令集在ARM64架构下实现一个简单的`memcpy`函数。请注意,实际的`memcpy`实现需要考虑内存对齐、错误检查和优化等多种因素,以下代码仅作为演示用途。 ```c #include <arm_neon.h> void* neon_memcpy(void* dest, const void* src, size_t len) { // 如果长度小于NEON处理的数据宽度,则不使用NEON指令 if (len < 16) { return memcpy(dest, src, len); } // 使用NEON指令进行内存拷贝 uint8_t* d = (uint8_t*)dest; const uint8_t* s = (const uint8_t*)src; uint8x16_t v1; for (; (uintptr_t)d & 15; len--) { *d++ = *s++; } size_t loopLen = len - (len % 16); while (loopLen > 0) { // 加载16字节数据到寄存器 v1 = vld1q_u8(s); // 存储寄存器中的数据到内存 vst1q_u8(d, v1); // 更新源和目标指针,并减少长度计数 s += 16; d += 16; loopLen -= 16; } // 如果剩余的数据小于16字节,使用普通内存拷贝 for (; len > 0; len--) { *d++ = *s++; } return dest; } ``` 请注意,这个示例代码可能需要根据实际硬件和编译器进行调整。在实际应用中,还需要考虑对齐、内存保护、缓存一致性以及可能的异常情况处理等问题。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值