Env
使用 intrinsics
需要的 cxxflags
# Append the SIMD-related compiler flags to the existing C++ flags:
# SSE4.1, FMA, and code generation for the build host's CPU (-march=native).
# NOTE(review): no explicit -mavx is passed; the AVX intrinsics presumably rely
# on -march=native enabling AVX on the build machine - confirm before
# building for a different target.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1 -march=native -mfma")
Case1
通过计算两个向量的逐元素乘法, 演示 Intel AVX intrinsics 的显式使用
Class Declaration
#include <utils.hpp>
#include <vector>
#include <random>
#include <array>
#include <boost/align/aligned_allocator.hpp>
#include <x86intrin.h>
class TestCase_1 {
inline static std::random_device dev; //NOLINT
inline static auto engine = std::default_random_engine(dev()); //NOLINT
inline static auto distribution = std::uniform_real_distribution<float>(0, 100); //NOLINT
inline static auto GetRandomNumber = [] {
return distribution(engine);
};
static constexpr auto default_size = 32768;
template<class T>
using vector = std::vector<float, boost::alignment::aligned_allocator<float, 32>>;
vector<float> a_, b_, c_;
public:
TestCase_1()
: a_(vector<float>(default_size))
, b_(vector<float>(default_size))
, c_(vector<float>(default_size))
{
std::array<vector<float>*, 3> list = {&a_, &b_, &c_};
for (auto& i: list) {
for (int j = 0; j < default_size; ++j) {
(*i)[j] = GetRandomNumber();
}
}
}
void multiply_1();
void multiply_2();
void multiply_3();
};
一般代码
一般代码如下, 直接使用基本算法逐元素计算两个向量 a 和 b 的乘积, 结果放在 c 中
// Baseline variant: multiplies a_ and b_ element by element, one element per
// iteration, and stores each product in the matching slot of c_.
void multiply_1() {
    LOG_DEBUG_TIME();
    int index = 0;
    while (index != default_size) {
        c_[index] = a_[index] * b_[index];
        ++index;
    }
}
一次循环计算4次
LOG_DEBUG_TIME();
for (auto i = 0; i < (default_size & (~0x3u)); i += 4) {
c_[i] = a_[i] * b_[i];
c_[i+1] = a_[i+1] * b_[i+1];
c_[i+2] = a_[i+2] * b_[i+2];
c_[i+3] = a_[i+3] * b_[i+3];
}
使用AVX指令
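对齐版本的 AVX 加载/存储指令 (_mm256_load_ps / _mm256_store_ps) 要求内存地址对齐于32字节边界 (非对齐版本为 _mm256_loadu_ps / _mm256_storeu_ps), 这里使用了 boost::alignment::aligned_allocator
作为内存分配器, 以保证向量数据按32字节对齐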
LOG_DEBUG_TIME();
__m256 A, B, C;
for (auto i = 0; i < (default_size & (~0x7u)); i += 8) {
A = _mm256_load_ps(&a_[i]);
B = _mm256_load_ps(&b_[i]);
C = _mm256_mul_ps(A, B);
_mm256_store_ps(&c_[i], C);
}
Output
<void TestCase_1::multiply_1()> 185
<void TestCase_1::multiply_2()> 181
<void TestCase_1::multiply_3()> 44