NEON的vmulq_lane_f32与vmlaq_lane_f32的使用

/* Excerpt of the compiler's arm_neon.h definition.
   Multiplies every lane of __a by one scalar: the lane of __b selected by
   __lane (extracted through __aarch64_vget_lane_f32), and returns the
   resulting 4-lane vector.  */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
{
  return __a * __aarch64_vget_lane_f32 (__b, __lane);
}

说明:把a中的每个lane与b中由lane参数指定的lane相乘并返回。

/* Excerpt of the compiler's arm_neon.h definition.
   Multiply-accumulate by lane: multiplies every lane of __b by the lane of
   __c selected by __lane, adds the product to the matching lane of __a, and
   returns the resulting 4-lane vector.  */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
        float32x2_t __c, const int __lane)
{
  return (__a + (__b * __aarch64_vget_lane_f32 (__c, __lane)));
}

说明:把b中的每个lane与c中由lane参数指定的lane相乘,然后与a的每个lane相加,并返回。

以如下代码为例:

#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>
#include <sys/stat.h>
#include <arm_neon.h>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>

using namespace std;
using namespace cv;

/*
 * Multiplies two 4x4 float matrices with NEON lane intrinsics.
 *
 * All three buffers hold 16 floats laid out column after column (4 floats
 * per column).  Result column c is the linear combination of the four
 * columns of matrixA weighted by the four entries of column c of matrixB.
 *
 * matrixA: left operand, 16 floats, read only.
 * matrixB: right operand, 16 floats, read only.
 * matrixR: output, 16 floats, one column stored per iteration.
 */
void altneonmult(const float *matrixA, const float *matrixB, float *matrixR)
{
    /* The four columns of A are loaded once and reused for every result column. */
    const float32x4_t colA0 = vld1q_f32(matrixA);
    const float32x4_t colA1 = vld1q_f32(matrixA + 4);
    const float32x4_t colA2 = vld1q_f32(matrixA + 8);
    const float32x4_t colA3 = vld1q_f32(matrixA + 12);

    for (int offset = 0; offset < 16; offset += 4)
    {
        /* Current column of B, split into halves because the *_lane_f32
           intrinsics take a 2-lane vector plus a constant lane index. */
        const float32x4_t colB = vld1q_f32(matrixB + offset);
        const float32x2_t lowB = vget_low_f32(colB);
        const float32x2_t highB = vget_high_f32(colB);

        float32x4_t acc = vmulq_lane_f32(colA0, lowB, 0);
        acc = vmlaq_lane_f32(acc, colA1, lowB, 1);
        acc = vmlaq_lane_f32(acc, colA2, highB, 0);
        acc = vmlaq_lane_f32(acc, colA3, highB, 1);

        /* Store the finished column at the same offset in the result. */
        vst1q_f32(matrixR + offset, acc);
    }
}

int main()
{
    struct timeval tv_start, tv_end;
    Mat matrixA(4, 4, CV_32FC1), matrixB(4, 4, CV_32FC1),matrixR(4, 4, CV_32FC1);

    float *pATmp = (float*)(matrixA.data);
    float *pBTmp = (float*)(matrixB.data);
    float *pRTmp = (float*)(matrixR.data);

    for(int i = 0; i < 4; i++)
    {
        for(int j = 0; j < 4; j++ )
        {
            *pATmp = i*4+j;
            *pBTmp = i*5+j;
            *pRTmp = 0.0f;
            pATmp++;
            pBTmp++;
            pRTmp++;
        }
    }
    ///transpose A and B, let cols align first
    transpose(matrixA, matrixA);
    transpose(matrixB, matrixB);
    pATmp = (float*)(matrixA.data);
    pBTmp = (float*)(matrixB.data);
    for(int i = 0; i < 4; i++)
    {
        for(int j = 0; j < 4; j++ )
        {
            printf("%f ", *pATmp);
            pATmp++;
        }
        printf("\n");
    }
        printf("\nA-------over\n");

    for(int i = 0; i < 4; i++)
    {
        for(int j = 0; j < 4; j++ )
        {
            printf("%f ", *pBTmp);
            pBTmp++;
        }
        printf("\n");
    }
        printf("\nB-------over\n");
    call  altneonmult to calcute the matrix multiply
    gettimeofday(&tv_start, 0);
    altneonmult(  (float*)(matrixA.data), (float*)(matrixB.data), (float*)(matrixR.data));
    gettimeofday(&tv_end, 0);
    printf("using neon %dμs\n", (tv_end.tv_sec * 1000000 + tv_end.tv_usec - tv_start.tv_sec * 1000000 - tv_start.tv_usec));

    pRTmp = (float*)(matrixR.data);
    for(int i = 0; i < 4; i++)
    {
        for(int j = 0; j < 4; j++ )
        {
            printf("%f ", *pRTmp);
            pRTmp++;
        }
        printf("\n");

    }

    ///using standard matrix multiply
    transpose(matrixA, matrixA);
    transpose(matrixB, matrixB);
    pATmp = (float*)(matrixA.data);
    pBTmp = (float*)(matrixB.data);
    pRTmp = (float*)(matrixR.data);
    memset((void*)pRTmp, 0, 4*4*sizeof(float));
    gettimeofday(&tv_start, 0);
    {
        for(int i = 0; i < 4; i++)
        {
            for(int j = 0; j < 4; j++)
            {
                for(int k=0; k < 4; k++)
                {
                    *(pRTmp+4*i+j) += (*(pATmp + 4*i + k)) * (*(pBTmp + k*4 + j));
                }
            }
        }
    }

    gettimeofday(&tv_end, 0);
    printf("using arm general calc %dμs\n", (tv_end.tv_sec * 1000000 + tv_end.tv_usec - tv_start.tv_sec * 1000000 - tv_start.tv_usec));
    pRTmp = (float*)(matrixR.data);
    for(int i = 0; i < 4; i++)
    {
        for(int j = 0; j < 4; j++ )
        {
            printf("%f ", *pRTmp);
            pRTmp++;
        }
        printf("\n");
    }

    return 0;
}

输出的结果如下:

Jelly-Pro:/data/local/tmp/neon # ./a.out                                       
WARNING: linker: /data/local/tmp/neon/a.out: unsupported flags DT_FLAGS_1=0x8000001
0.000000 4.000000 8.000000 12.000000
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000
using neon 3μs
70.000000 190.000000 310.000000 430.000000
76.000000 212.000000 348.000000 484.000000
82.000000 234.000000 386.000000 538.000000
88.000000 256.000000 424.000000 592.000000
Jelly-Pro:/data/local/tmp/neon # ./a.out                                       
WARNING: linker: /data/local/tmp/neon/a.out: unsupported flags DT_FLAGS_1=0x8000001
0.000000 4.000000 8.000000 12.000000
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000

A-------over
0.000000 5.000000 10.000000 15.000000
1.000000 6.000000 11.000000 16.000000
2.000000 7.000000 12.000000 17.000000
3.000000 8.000000 13.000000 18.000000

B-------over
using neon 2μs
70.000000 190.000000 310.000000 430.000000
76.000000 212.000000 348.000000 484.000000
82.000000 234.000000 386.000000 538.000000
88.000000 256.000000 424.000000 592.000000
Jelly-Pro:/data/local/tmp/neon # ./a.out                                       
WARNING: linker: /data/local/tmp/neon/a.out: unsupported flags DT_FLAGS_1=0x8000001
0.000000 4.000000 8.000000 12.000000
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000

A-------over
0.000000 5.000000 10.000000 15.000000
1.000000 6.000000 11.000000 16.000000
2.000000 7.000000 12.000000 17.000000
3.000000 8.000000 13.000000 18.000000

B-------over
using neon 2μs
70.000000 190.000000 310.000000 430.000000
76.000000 212.000000 348.000000 484.000000
82.000000 234.000000 386.000000 538.000000
88.000000 256.000000 424.000000 592.000000
using arm general calc 3μs
140.000000 266.000000 392.000000 518.000000
266.000000 424.000000 582.000000 740.000000
392.000000 582.000000 772.000000 962.000000
518.000000 740.000000 962.000000 1184.000000
Jelly-Pro:/data/local/tmp/neon # ./a.out                                       
WARNING: linker: /data/local/tmp/neon/a.out: unsupported flags DT_FLAGS_1=0x8000001
0.000000 4.000000 8.000000 12.000000
1.000000 5.000000 9.000000 13.000000
2.000000 6.000000 10.000000 14.000000
3.000000 7.000000 11.000000 15.000000

A-------over
0.000000 5.000000 10.000000 15.000000
1.000000 6.000000 11.000000 16.000000
2.000000 7.000000 12.000000 17.000000
3.000000 8.000000 13.000000 18.000000

B-------over
using neon 3μs
70.000000 190.000000 310.000000 430.000000
76.000000 212.000000 348.000000 484.000000
82.000000 234.000000 386.000000 538.000000
88.000000 256.000000 424.000000 592.000000
using arm general calc 3μs
70.000000 76.000000 82.000000 88.000000
190.000000 212.000000 234.000000 256.000000
310.000000 348.000000 386.000000 424.000000
430.000000 484.000000 538.000000 592.000000
Jelly-Pro:/data/local/tmp/neon #


编译选项是:

# Cross-compiles every ./*.cpp in this directory with the Android NDK
# toolchain and links the objects against the OpenCV Android SDK into a.out.
NDK_ROOT=/home/android-ndk-r16b
CC=$(NDK_ROOT)/build/toolchains/aarch64-linux-android-clang++/bin/aarch64-linux-android-g++

SOURCES := $(shell ls ./*.cpp)
OBJS := $(patsubst %.cpp, %.o, $(SOURCES))
BIN := a.out

INCLUDES := \
    -I $(NDK_ROOT)/sysroot/usr/include/ \
    -I /data_1/songqing/tk1/3rdparty/OpenCV-android-sdk-3.1.0/sdk/native/jni/include

CFLAGS = -O3 -march=armv8-a  -pie -fPIE   -ffast-math
# Alternative flag set, kept for reference (disabled):
#CFLAGS = -O3 -march=armv8-a  -mcpu=cortex-a8  -mfpu=neon -mfloat-abi=hard -ffast-math

LIBS := \
     -pie -fPIE \
      -L /data_1/songqing/tk1/3rdparty/OpenCV-android-sdk-3.1.0/sdk/native/libs/arm64-v8a \
    -lopencv_core -lopencv_imgproc -lopencv_highgui -lopencv_imgcodecs \
    -L /data_1/songqing/tk1/3rdparty/OpenCV-android-sdk-3.1.0/sdk/native/3rdparty/libs/arm64-v8a \
    -lIlmImf -llibjasper -llibjpeg -llibpng -llibtiff -llibwebp -ltbb \
     -lm -llog -lz

# Recipe lines must start with a TAB character, not spaces.
$(BIN): $(OBJS)
	$(CC)  $(OBJS) $(LIBS)  -o $(BIN)

%.o: %.cpp
	$(CC)  $(INCLUDES) $(CFLAGS) -c $< -o $@

 

 


 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值