## 向量内积
- 向量a和b的内积为:$a \cdot b = a_{1} \times b_{1} + a_{2} \times b_{2} + \cdots + a_{n} \times b_{n}$
## 利用`_mm256_dp_ps`实现向量内积(float输入,结果以double累加)
// Computes the dot product of the first n elements of a and b using the AVX
// _mm256_dp_ps instruction, accumulating the result in double precision.
//
// a, b : input arrays holding at least n floats each (need not be aligned).
// n    : number of elements to multiply-accumulate; n <= 0 yields 0.0.
// Returns the sum of a[i] * b[i] for i in [0, n).
//
// The target attribute lets this function build even when the translation
// unit is not compiled with -mavx; it must still run on an AVX-capable CPU.
__attribute__((target("avx")))
double dot(float* a, float* b, int n)
{
    if (n <= 0 || a == nullptr || b == nullptr)
    {
        return 0.0;
    }

    double res = 0.0;

    // A 256-bit register holds 8 floats, so only whole groups of 8 go through
    // the vector path; this keeps every load inside the arrays.
    const int vec_end = n - (n % 8);
    for (int i = 0; i < vec_end; i += 8)
    {
        const __m256 ai = _mm256_loadu_ps(a + i); // unaligned load
        const __m256 bi = _mm256_loadu_ps(b + i);

        // Mask 0xF1: multiply all four elements of each 128-bit lane and store
        // the lane's sum in element 0 of that lane. The instruction works per
        // lane, so the low and high halves each carry a partial sum.
        const __m256 r = _mm256_dp_ps(ai, bi, 0xF1);
        const float low_sum = _mm_cvtss_f32(_mm256_castps256_ps128(r));
        const float high_sum = _mm_cvtss_f32(_mm256_extractf128_ps(r, 1));
        res += static_cast<double>(low_sum) + static_cast<double>(high_sum);
    }

    // Scalar tail for the remaining n % 8 elements.
    for (int i = vec_end; i < n; ++i)
    {
        res += static_cast<double>(a[i]) * static_cast<double>(b[i]);
    }
    return res;
}
## 利用`_mm256_hadd_ps`等实现向量内积(float输入,结果以double累加)
// Computes the dot product of the first n elements of a and b using AVX
// multiply plus horizontal adds (_mm256_mul_ps / _mm256_hadd_ps), accumulating
// the result in double precision.
//
// a, b : input arrays holding at least n floats each (need not be aligned).
// n    : number of elements to multiply-accumulate; n <= 0 yields 0.0.
// Returns the sum of a[i] * b[i] for i in [0, n).
//
// The target attribute lets this function build even when the translation
// unit is not compiled with -mavx; it must still run on an AVX-capable CPU.
__attribute__((target("avx")))
double dot1(float* a, float* b, int n)
{
    if (n <= 0 || a == nullptr || b == nullptr)
    {
        return 0.0;
    }

    double res = 0.0;
    const __m256 zero = _mm256_setzero_ps(); // loop-invariant, built once

    // Only whole groups of 8 floats go through the vector path so that no
    // load reads past the end of the arrays.
    const int vec_end = n - (n % 8);
    for (int i = 0; i < vec_end; i += 8)
    {
        const __m256 ai = _mm256_loadu_ps(a + i); // unaligned load
        const __m256 bi = _mm256_loadu_ps(b + i);
        __m256 t = _mm256_mul_ps(ai, bi);

        // hadd works within each 128-bit lane: after two rounds against zero,
        // element 0 of each lane holds the sum of that lane's four products.
        t = _mm256_hadd_ps(t, zero);
        t = _mm256_hadd_ps(t, zero);

        const float low_sum = _mm_cvtss_f32(_mm256_castps256_ps128(t));
        const float high_sum = _mm_cvtss_f32(_mm256_extractf128_ps(t, 1));
        res += static_cast<double>(low_sum) + static_cast<double>(high_sum);
    }

    // Scalar tail for the remaining n % 8 elements.
    for (int i = vec_end; i < n; ++i)
    {
        res += static_cast<double>(a[i]) * static_cast<double>(b[i]);
    }
    return res;
}
## 调用程序
#include<iostream>
#include<x86intrin.h>
#include<cstdlib>
#include<fpu_control.h>
#include<stdio.h>
#include<time.h>
using namespace std;
// Benchmark driver: fills a vector of the requested length with random values
// in [0, 1], then times the SIMD dot product against originalDot and prints
// both results and elapsed CPU times.
//
// Usage: <program> <vector length>
// Returns 0 on success, 1 on a missing/invalid argument or allocation failure.
//
// NOTE(review): originalDot is not defined in this excerpt; presumably it is
// the plain scalar reference implementation -- confirm it is declared before
// this point.
int main(int argc, char** argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <vector length>\n", argc > 0 ? argv[0] : "mcl");
        return 1;
    }

    // Parse with strtol so that garbage, non-positive and out-of-range values
    // are rejected instead of silently becoming 0 or overflowing.
    char* end = nullptr;
    const long requested = strtol(argv[1], &end, 10);
    const int N = static_cast<int>(requested);
    if (end == argv[1] || *end != '\0' || requested <= 0 || N != requested)
    {
        fprintf(stderr, "invalid vector length: %s\n", argv[1]);
        return 1;
    }

    float* d = static_cast<float*>(malloc(sizeof(float) * N));
    if (d == nullptr)
    {
        fprintf(stderr, "failed to allocate %d floats\n", N);
        return 1;
    }
    for (int i = 0; i < N; i++)
    {
        d[i] = static_cast<float>(static_cast<double>(rand()) / RAND_MAX);
    }

    clock_t start_time = clock();
    const double r1 = dot(d, d, N);
    clock_t end_time = clock();
    printf("sse res = %f\n", r1);
    printf("sse time = %fs\n", static_cast<double>(end_time - start_time) / CLOCKS_PER_SEC);

    start_time = clock();
    const double r2 = originalDot(d, d, N);
    end_time = clock();
    printf("sequence res = %f\n", r2);
    printf("sequence time = %fs\n", static_cast<double>(end_time - start_time) / CLOCKS_PER_SEC);

    free(d);
    return 0;
}
## 编译
g++ mcl.cpp -o mcl -msse -mavx -ffloat-store
## 运行结果
- 当向量中的数据较大时精度会损失,本机实测数据大于3000时精度就开始下降,所以在计算内积之前要先对向量进行标准化
- 使用SIMD指令(代码中为AVX的`_mm256_*`系列)的速度约为普通顺序执行的两倍,达不到理论上的4倍,因为加载数据的访存时间也占了一部分
yfzhongchao@yfzhongchao-pc:~/workspace/cpp/mc$ ./mcl 300000000
sse res = 99995252.458352
sse time = 0.476225s
sequence res = 99995252.457381
sequence time = 0.908160s