ICL Auto Vectorization

最新推荐文章于 2020-09-09 17:13:44 发布

aban-mtd

最新推荐文章于 2020-09-09 17:13:44 发布

阅读量1.8k

点赞数

分类专栏：代码片段笔记一步步做程序优化 C/C++ 文章标签：英特尔 c语言 c++ 向量化

本文链接：https://blog.csdn.net/bendanban/article/details/49969255

版权

C/C++ 同时被 3 个专栏收录

63 篇文章 0 订阅

订阅专栏

笔记

46 篇文章 0 订阅

订阅专栏

代码片段

13 篇文章 1 订阅

订阅专栏

简介

此文简单介绍如何使用intel c++编译器实现向量化加速。

全文如下安排：

base ：待优化的源代码。
vectorization ：第一个向量化版本。
aligned ：内存对齐对向量化的影响。

base

base版本代码：

// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;


int64_t cpu_freq;
int64_t cpu_counter(){
  int64_t clock;
    QueryPerformanceCounter((LARGE_INTEGER*)&clock);
  return clock;
}

// output time
// Timing helpers. When enabled, TB__ stores the start tick and TE__ stores the
// end tick, then prints "<source line> : <elapsed> seconds", where elapsed is
// the tick difference divided by cpu_freq.
#if 1
  int64_t gloabel_timer_begin;
  int64_t gloabel_timer_end;
  #define TB__ gloabel_timer_begin=cpu_counter()
  // Wrapped in do { ... } while (0) so that TE__ expands to exactly one
  // statement and stays intact inside an unbraced if/else or loop body.
  #define TE__ do { \
    gloabel_timer_end = cpu_counter(); \
    cout << __LINE__ << " : " << double(gloabel_timer_end-gloabel_timer_begin)/double(cpu_freq) << " seconds" << endl; \
  } while (0)
#else
  // Timing disabled: both macros expand to nothing.
  #define TB__ 
  #define TE__ 
#endif

// repeat times
#define REPEATTIMES 100000

// initialize data 
// Fills the first `cols` entries of each of the `rows` rows of `data` with
// rand()/RAND_MAX. Consecutive rows start `true_cols` floats apart, so entries
// between `cols` and `true_cols` in a row are left untouched.
void init(float *data, int rows, int cols, int true_cols){
  int r = 0;
  while (r < rows){
    int c = 0;
    while (c < cols){
      const int offset = r*true_cols + c;
      data[offset] = float(rand())/float(RAND_MAX);
      ++c;
    }
    ++r;
  }
}

void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);

// Adds up, in row-major order, the first `cols` entries of each of the `rows`
// rows of `data` (rows start `true_cols` floats apart) in single precision and
// writes the sum to standard output, followed by a newline and a flush.
void print_sum(float *data, int rows, int cols, int true_cols){
  float sum = 0;
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;
    for (int c = 0; c < cols; ++c){
      sum += data[base + c];
    }
  }
  std::cout << sum << std::endl;
}

int main(){
  QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);

  int rows = 100; 
  int cols = 101;

  int true_cols = cols;
  float *A = (float*)malloc(rows*true_cols*sizeof(float));
  float *B = (float*)malloc(rows*sizeof(float));
  float *C = (float*)malloc(rows*sizeof(float));

  init(A, rows, cols, true_cols);
  init(B, rows, 1, 1);

  // computing
  TB__;
  for (int k = 0; k < REPEATTIMES; k++){
    multiply(C, A, B, rows, cols, true_cols);
  }
  TE__;

  // print result.  
  print_sum(C, rows, 1, 1);

  free(A); A = NULL;
  free(B); B = NULL;
  free(C); C = NULL;

  return 0;
}

// filename : multiply.cpp
// Matrix-vector product. For every row i in [0, rows), C[i] is reset to 0 and
// then accumulates A[i*true_cols + j] * B[j] for j in [0, cols).
// `true_cols` is the distance, in floats, between the starts of two rows of A.
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols){
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;
    C[r] = 0;
    for (int c = 0; c < cols; ++c){
      C[r] += A[base + c]*B[c];
    }
  }
}

编译：

user@machine> icl /O1 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp

执行：

user@machine> main.exe
73 : 0.877882 seconds
2483.53

vectorization

源代码保持不变

编译：

user@machine> icl /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp

执行：

user@machine> main.exe
73 : 0.205989 seconds
2483.53

执行速度提升了 4倍左右。

aligned

源代码修改。（注意：下面的代码有问题，结果可能有错误，原因可能是内存的问题。）

// filename : main.cpp
#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <cstdint>
#include <malloc.h>
#include <windows.h>
using namespace std;


int64_t cpu_freq;
int64_t cpu_counter(){
  int64_t clock;
    QueryPerformanceCounter((LARGE_INTEGER*)&clock);
  return clock;
}

// output time
// Timing helpers. When enabled, TB__ stores the start tick and TE__ stores the
// end tick, then prints "<source line> : <elapsed> seconds", where elapsed is
// the tick difference divided by cpu_freq.
#if 1
  int64_t gloabel_timer_begin;
  int64_t gloabel_timer_end;
  #define TB__ gloabel_timer_begin=cpu_counter()
  // Wrapped in do { ... } while (0) so that TE__ expands to exactly one
  // statement and stays intact inside an unbraced if/else or loop body.
  #define TE__ do { \
    gloabel_timer_end = cpu_counter(); \
    cout << __LINE__ << " : " << double(gloabel_timer_end-gloabel_timer_begin)/double(cpu_freq) << " seconds" << endl; \
  } while (0)
#else
  // Timing disabled: both macros expand to nothing.
  #define TB__ 
  #define TE__ 
#endif

// repeat times
#define REPEATTIMES 100000

// initialize data 
// Fills the first `cols` entries of each of the `rows` rows of `data` with
// rand()/RAND_MAX. Consecutive rows start `true_cols` floats apart, so the
// padding entries between `cols` and `true_cols` are left untouched.
void init(float *data, int rows, int cols, int true_cols){
  int r = 0;
  while (r < rows){
    int c = 0;
    while (c < cols){
      const int offset = r*true_cols + c;
      data[offset] = float(rand())/float(RAND_MAX);
      ++c;
    }
    ++r;
  }
}

void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols);

// Adds up, in row-major order, the first `cols` entries of each of the `rows`
// rows of `data` (rows start `true_cols` floats apart) in single precision and
// writes the sum to standard output, followed by a newline and a flush.
void print_sum(float *data, int rows, int cols, int true_cols){
  float sum = 0;
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;
    for (int c = 0; c < cols; ++c){
      sum += data[base + c];
    }
  }
  std::cout << sum << std::endl;
}

int main(){
  QueryPerformanceFrequency((LARGE_INTEGER *)&cpu_freq);

  int rows = 100; 
  int cols = 101;

#ifdef ALIGNED
  #define ALLIGNED_LEN 32
  int true_cols = ((((cols*sizeof(float))+ALLIGNED_LEN-1)/ALLIGNED_LEN)*ALLIGNED_LEN)/sizeof(float);
  //cout << true_cols << endl;
  float *A = (float*)_aligned_malloc(rows*true_cols*sizeof(float), ALLIGNED_LEN);
  float *B = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);
  float *C = (float*)_aligned_malloc(rows*sizeof(float), ALLIGNED_LEN);
#else
  int true_cols = cols;
  float *A = (float*)malloc(rows*true_cols*sizeof(float));
  float *B = (float*)malloc(rows*sizeof(float));
  float *C = (float*)malloc(rows*sizeof(float));
#endif

  init(A, rows, cols, true_cols);
  init(B, rows, 1, 1);

  // computing
  TB__;
  for (int k = 0; k < REPEATTIMES; k++){
    multiply(C, A, B, rows, cols, true_cols);
  }
  TE__;

  // print result.  
  print_sum(C, rows, 1, 1);

#ifdef ALIGNED
  _aligned_free(A); A = NULL;
  _aligned_free(B); B = NULL;
  _aligned_free(C); C = NULL;
#else
  free(A); A = NULL;
  free(B); B = NULL;
  free(C); C = NULL;
#endif

  return 0;
}

// filename : multiply.cpp
// Matrix-vector product. For every row i in [0, rows), C[i] is reset to 0 and
// then accumulates A[i*true_cols + j] * B[j] for j in [0, cols).
// `true_cols` is the distance, in floats, between the starts of two rows of A.
// When ALIGNED is defined, the inner loop carries `#pragma vector aligned`.
void multiply(float *C, float *A, float *B, int rows, int cols, int true_cols){
  for (int r = 0; r < rows; ++r){
    const int base = r*true_cols;
    C[r] = 0;
    // The pragma must stay directly in front of the loop it annotates.
    #ifdef ALIGNED
    #pragma vector aligned
    #endif
    for (int c = 0; c < cols; ++c){
      C[r] += A[base + c]*B[c];
    }
  }
}

编译：

user@machine> icl /DALIGNED /O2 /Qopt-report:1 /Qopt-report-phase:vec main.cpp multiply.cpp

执行：

82 : 0.17747 seconds
2483.53

相对第一个优化的版本又提升了一点速度。

结论

vectorization版本：不需要改变源代码，通过修改编译器选项直接实现向量化。
aligned版本：需要修改代码，使内存对齐，可以进一步提升性能。

aban-mtd

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
ICL Auto Vectorization

简介此文简单介绍如何使用intel c++编译器实现向量化加速。全文如下安排：base ：待优化的源代码。vectorization ：第一个向量化版本。aligned ：内存对其对向量化的影响。basebase版本代码：// filename : main.cpp#include <iostream>#include <iomanip>#include <stdlib.h>#
复制链接

扫一扫