前言
结论:对单列(一维)float 数组求最大值时,AVX 版本比 SSE 版本稍慢!
math_function.h
#pragma once
#include <immintrin.h>
#include <stdio.h>
// Scalar reference implementation: returns the largest of the `size` floats
// starting at `input`.
float MathMax(const float *input, int size);
// SSE implementation of the same reduction, working on 128-bit vectors
// (4 floats per step). Prints a message and returns -1 when `input` is null.
float SSEMax(const float *input, int size);
// AVX implementation of the same reduction, working on 256-bit vectors
// (8 floats per step). Prints a message and returns -1 when `input` is null.
float AVXMax(const float *input, int size);
math_function.cpp
#include "math_function.h"
// Scalar reference implementation of the max reduction.
//
// @param input  Pointer to the first of `size` floats.
// @param size   Number of elements to scan; must be positive.
// @return The largest element, or -1 (after printing a diagnostic) when
//         `input` is null or `size` is not positive. The -1 sentinel matches
//         the error convention used by SSEMax/AVXMax.
float MathMax(const float *input, int size)
{
    if (input == nullptr)
    {
        printf("input data is null\n");
        return -1.0f;
    }
    if (size <= 0)
    {
        // Without this guard input[0] would be read from an empty range.
        printf("input size must be positive\n");
        return -1.0f;
    }

    float maxVal = input[0];
    for (int i = 1; i < size; i++)
    {
        maxVal = maxVal > input[i] ? maxVal : input[i];
    }
    return maxVal;
}
// SSE implementation of the max reduction (4 floats per 128-bit vector).
//
// The input does not need to be 16-byte aligned: unaligned loads are used so
// that any float pointer (e.g. a sub-range of a larger buffer) is accepted.
//
// @param input  Pointer to the first of `size` floats.
// @param size   Number of elements to scan; must be positive.
// @return The largest element, or -1 (after printing a diagnostic) when
//         `input` is null or `size` is not positive.
float SSEMax(const float *input, int size)
{
    if (input == nullptr)
    {
        printf("input data is null\n");
        return -1.0f;
    }
    if (size <= 0)
    {
        printf("input size must be positive\n");
        return -1.0f;
    }

    constexpr int nBlockWidth = 4;
    const int cntBlock = size / nBlockWidth;

    const float *p = input;
    float maxVal_ = input[0];

    // Only touch full vectors when at least one complete block exists;
    // otherwise the first load would read past the end of the input.
    if (cntBlock > 0)
    {
        __m128 maxVal = _mm_loadu_ps(p);
        p += nBlockWidth;
        for (int i = 1; i < cntBlock; i++)
        {
            maxVal = _mm_max_ps(maxVal, _mm_loadu_ps(p));
            p += nBlockWidth;
        }

        // Horizontal reduction of the four lane-wise maxima.
        alignas(16) float output[nBlockWidth];
        _mm_store_ps(output, maxVal);
        maxVal_ = output[0];
        for (int i = 1; i < nBlockWidth; i++)
        {
            maxVal_ = maxVal_ > output[i] ? maxVal_ : output[i];
        }
    }

    // Scalar tail: the elements left after the last full block (or the whole
    // input when it is shorter than one block).
    for (const float *end = input + size; p != end; ++p)
    {
        maxVal_ = maxVal_ > *p ? maxVal_ : *p;
    }
    return maxVal_;
}
// AVX implementation of the max reduction (8 floats per 256-bit vector).
//
// The input does not need to be 32-byte aligned: unaligned loads are used, so
// buffers from malloc/new or sub-ranges of a larger array are accepted.
// The CPU executing this function must support AVX.
//
// @param input  Pointer to the first of `size` floats.
// @param size   Number of elements to scan; must be positive.
// @return The largest element, or -1 (after printing a diagnostic) when
//         `input` is null or `size` is not positive.
#if defined(__GNUC__)
// Lets GCC/Clang compile the AVX intrinsics below without a global -mavx flag
// (MSVC uses /arch:AVX and does not need this).
__attribute__((target("avx")))
#endif
float AVXMax(const float *input, int size)
{
    if (input == nullptr)
    {
        printf("input data is null\n");
        return -1.0f;
    }
    if (size <= 0)
    {
        printf("input size must be positive\n");
        return -1.0f;
    }

    constexpr int nBlockWidth = 8;
    const int cntBlock = size / nBlockWidth;

    const float *p = input;
    float maxVal_ = input[0];

    // Only touch full vectors when at least one complete block exists;
    // otherwise the first load would read past the end of the input.
    if (cntBlock > 0)
    {
        __m256 maxVal = _mm256_loadu_ps(p);
        p += nBlockWidth;
        for (int i = 1; i < cntBlock; i++)
        {
            maxVal = _mm256_max_ps(maxVal, _mm256_loadu_ps(p));
            p += nBlockWidth;
        }

        // Horizontal reduction of the eight lane-wise maxima.
        alignas(32) float output[nBlockWidth];
        _mm256_store_ps(output, maxVal);
        maxVal_ = output[0];
        for (int i = 1; i < nBlockWidth; i++)
        {
            maxVal_ = maxVal_ > output[i] ? maxVal_ : output[i];
        }
    }

    // Scalar tail: the elements left after the last full block (or the whole
    // input when it is shorter than one block).
    for (const float *end = input + size; p != end; ++p)
    {
        maxVal_ = maxVal_ > *p ? maxVal_ : *p;
    }
    return maxVal_;
}
main.cpp
#include "math_function.h"
#include <random>
#include <time.h>
using std::default_random_engine;
using std::uniform_real_distribution;
// Calls `fn(input, size)` `cntLoop` times in a row, then prints the last
// result as "<label> = <value>" followed by the elapsed time in milliseconds.
static void RunBenchmark(const char *label, float (*fn)(const float *, int),
                         const float *input, int size, int cntLoop)
{
    float result = 0.0f;
    const clock_t start_t = clock();
    for (int i = 0; i < cntLoop; i++)
        result = fn(input, size);
    const clock_t elapsed = clock() - start_t;

    printf("%s = %f\t", label, result);
    // clock() counts ticks, not milliseconds, and clock_t is not an int:
    // convert explicitly instead of passing the raw difference to "%d".
    printf("cost time: %.0f(ms)\n",
           static_cast<double>(elapsed) * 1000.0 / CLOCKS_PER_SEC);
}

// Benchmark driver: fills a small array with random floats in [0, 3), prints
// it eight values per row, then times MathMax, SSEMax and AVXMax on it.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    constexpr int size = 58;
    // 32-byte alignment satisfies aligned 256-bit (and 128-bit) vector loads;
    // a plain malloc'ed buffer does not guarantee that.
    alignas(32) float input[size];

    std::default_random_engine e;
    std::uniform_real_distribution<float> u(0, 3); // random number distribution object
    for (int i = 0; i < size; i++)
    {
        input[i] = u(e);
        printf("%f ", input[i]);
        if ((i + 1) % 8 == 0)
            printf("\n");
    }
    printf("\n");

    const int cntLoop = 100000000;
    RunBenchmark("org", MathMax, input, size, cntLoop);
    RunBenchmark("sse", SSEMax, input, size, cntLoop);
    RunBenchmark("avx", AVXMax, input, size, cntLoop);

    // Keep the console window open until the user presses Enter.
    getchar();
    return 0;
}
运行结果
测试硬件:CPU-i7-9700K
预处理器:_WINDOWS
命令行:/arch:AVX
优化项:/O2
任何问题请加唯一QQ2258205918(名称samylee)!
或唯一VX:samylee_csdn