#include <stdio.h>
#include <time.h>
#include <x86intrin.h>
void matmul_avx(const float *x, const float **w,float *y,const int col,const int row){
const int col_reduced_8 = col - col % 8;
float scratchpad[8];
__m256 op0, op1, tgt, tmp_vec;
for (int i = 0; i < row; i++) {
float res = 0;
tgt = _mm256_setzero_ps();
for (int j = 0; j < col_reduced_8; j += 8) {
op0 = __builtin_ia32_loadups256(&x[j]);
op1 = __builtin_ia32_loadups256(&w[i][j]);
tmp_vec = __builtin_ia32_mulps256(op0, op1);
tgt = __builtin_ia32_addps256(tmp_vec, tgt);
}
__builtin_ia32_storeups256(scratchpad, tgt);
for (int k = 0; k < 8; k++)
res += scratchpad[k];
for (int l = col_reduced_8; l < col; l++) {
res += w[i][l] *
AVX指令集矩阵乘向量算法
最新推荐文章于 2024-05-13 11:13:32 发布