Linux 版的 Intel MKL 的安装使用

最新推荐文章于 2024-05-27 20:44:52 发布

20kg镀金

最新推荐文章于 2024-05-27 20:44:52 发布

阅读量1w

点赞数 1

分类专栏： mkl

mkl 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

1.下载

https://software.intel.com/en-us/mkl

链接：https://pan.baidu.com/s/1ysHRNqGOhL72YC7KZXU_uA 密码：8ivh

最新版下载方法请自行研究。

文件名字类似 l_mkl_2017.3.196.tgz

2.安装

1）解压至任意目录（安装后可删除）

2）# ./install.sh

默认安装至 /opt/, 可配置安装路径。

3）在 /etc/ld.so.conf.d 下创建名为 intel-mkl.conf 的文件，内容为

/opt/intel/mkl/lib/intel64
/opt/intel/lib/intel64
然后执行

# ldconfig -v
4) 执行

$ /opt/intel/mkl/bin/mklvars.sh intel64 mod
见：https://software.intel.com/en-us/mkl-linux-developer-guide-scripts-to-set-environment-variables

3.使用
以编译官方文档上的 dgemm_example.c 为例

#define min(x,y) (((x) < (y)) ? (x) : (y))
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

int main()
{
double *A, *B, *C;
int m, n, p, i, j;
double alpha, beta;

printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
" Intel(R) MKL function dgemm, where A, B, and C are matrices and \n"
" alpha and beta are double precision scalars\n\n");

m = 2000, p = 200, n = 1000;
printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
" A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
alpha = 1.0; beta = 0.0;
printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
" performance \n\n");
A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
if (A == NULL || B == NULL || C == NULL) {
printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
mkl_free(A);
mkl_free(B);
mkl_free(C);
return 1;
}

printf (" Intializing matrix data \n\n");
for (i = 0; i < (m*p); i++) {
A[i] = (double)(i+1);
}

for (i = 0; i < (p*n); i++) {
B[i] = (double)(-i-1);
}

for (i = 0; i < (m*n); i++) {
C[i] = 0.0;
}

printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, p, alpha, A, p, B, n, beta, C, n);
printf ("\n Computations completed.\n\n");

printf (" Top left corner of matrix A: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(p,6); j++) {
printf ("%12.0f", A[j+i*p]);
}
printf ("\n");
}

printf ("\n Top left corner of matrix B: \n");
for (i=0; i<min(p,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.0f", B[j+i*n]);
}
printf ("\n");
}

printf ("\n Top left corner of matrix C: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C[j+i*n]);
}
printf ("\n");
}

printf ("\n Deallocating memory \n\n");
mkl_free(A);
mkl_free(B);
mkl_free(C);

printf (" Example completed. \n\n");
return 0;
}
（代码下载地址：https://software.intel.com/en-us/product-code-samples）
编译命令为：

$ gcc -I/opt/intel/mkl/include dgemm_example.c -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -lm -L/opt/intel/mkl/lib/intel64 -L/opt/intel/lib/intel64

或者

$ gcc -I/opt/intel/mkl/include dgemm_example.c -lmkl_rt -L/opt/intel/mkl/lib/intel64 -L/opt/intel/lib/intel64

再或者

$ . /opt/intel/bin/compilervars.sh intel64

$ gcc dgemm_example.c -lmkl_rt

（此方法可行是因为前一个命令设置了环境变量 CPATH，LD_LIBRARY_PATH，LIBRARY_PATH，致使编译器可以找到所需的头文件和库文件。编译C时头文件查找 C_INCLUDE_PATH 中包含目录，C++ 查找 CPLUS_INCLUDE_PATH，C和C++都查找 CPATH）

链接MKL的库的方法见：https://software.intel.com/en-us/mkl-linux-developer-guide-linking-your-application-with-the-intel-math-kernel-library

(学习文档：https://software.intel.com/en-us/get-started-with-mkl-for-linux ,https://software.intel.com/en-us/mkl-linux-developer-guide)

后记：

Intel的MPI库的安装方法与MKL相同，执行 compilervars.sh 之后即可编译使用了MPI库的文件。

gmres_test.c

/* Example to show how to use Intel's FGMRES with preconditioner to solve the linear system Ax=b in MPI.
* Based on Intel's example: solverc/source/fgmres_full_funct_c.c
* For CS51501 HW3 Part b
*
* Please read Intel Reference Manual, Chapter 6 Sparse Solve Routine, FGMRES Interface Description for the detail information.
*/
#include <stdio.h>
#include "mkl.h"
#include "mpi.h"

#define MASTER 0       // taskid of first task
#define RESTART 500
#define TOL 0.00000001
#define MAXIT 1000

void mpi_dgemv(const MKL_INT m, const MKL_INT local_m, const double *A, const double *u, double *v, double *local_u, double *local_v, int taskid, MPI_Comm comm);
void mpi_preconditioner_solver(const MKL_INT m, const MKL_INT local_m, const double *local_M, const double *u, double *v, double *local_u, int taskid, MPI_Comm comm);

int main(int argc, char *argv[])
{
   int taskid;       // a task identifier
   int numtasks;       // number of tasks in partition
   MPI_Comm comm;
   int m;           // size of the matrix
   int local_m;       // rows of matrix A sent to each worker
   double *A, *b, *exact_x, *x;
   double *temp_1, *temp_2;
   double *local_A, *local_v, *local_u;
   double *local_M;   // M is the preconditioner in this example, which is the diagonal element of A;
   int i, j, k;

   MPI_Init(&argc, &argv);
   comm = MPI_COMM_WORLD;
   MPI_Comm_rank(comm, &taskid);
   MPI_Comm_size(comm, &numtasks);
   if (taskid == MASTER) {   // initilization: A and b
       /* start modification 1: read A and b from mtx files in node 0 */
       m = 64;       // size of the matrix
       A = malloc(sizeof(double) * (m * m));
       // !!! A is in col-major
       for (j = 0; j < m; j++)
           for (i = 0; i < m; i++) {
               if (i == j)
                   *(A + j * m + i) = m * 100.0;
               else
                   *(A + j * m + i) = i + 1.0;
           }
       exact_x = malloc(sizeof(double) * m);
       for (i = 0; i < m; i++)
           *(exact_x + i) = 1.0;
       b = malloc(sizeof(double) * m);
       // b=A*ones(n,1)
       cblas_dgemv(CblasColMajor, CblasNoTrans, m, m, 1.0, A, m, exact_x, 1, 0.0, b, 1);
       /* end modification 1 */
   }

   MPI_Bcast(&m, 1, MPI_INT, MASTER, comm);   // send m from node MASTER to all other nodes.
   local_m = m / numtasks;
   local_A = malloc(sizeof(double) * (local_m * m));
   local_u = malloc(sizeof(double) * (local_m));
   local_v = malloc(sizeof(double) * m);
   // partition A and send A_i to local_A on node i
   MPI_Scatter(A, local_m * m, MPI_DOUBLE, local_A, local_m * m, MPI_DOUBLE, MASTER, comm);

   if (taskid == MASTER) {
       free(A);
       free(exact_x);
       // do not free b, it wil be used for GMRES
   }

   /* start modification 2: generate preconditioner M
   * In this example, TA choose the diagonal elements of A as the preconditioner.
   * In HW3 part b, you should generate L and U here.
   */
   local_M = malloc(sizeof(double) * local_m);
   for (i = 0; i < local_m; i++)
       *(local_M + i) = *(local_A + taskid * local_m + i * m + i);
   /* end modification 2 */

   /*---------------------------------------------------------------------------
   * GMRES: Allocate storage for the ?par parameters and the solution vectors
   *---------------------------------------------------------------------------*/
   MKL_INT RCI_request;
   int RCI_flag;
   double dvar;
   int flag = 0;

   MKL_INT ipar[128];   //specifies the integer set of data for the RCI FGMRES computations
   double dpar[128];   // specifies the double precision set of data
   double *tmp;       //used to supply the double precision temporary space for theRCI FGMRES computations, specifically:
   double *computed_solution;
   double *residual;
   double *f;
   MKL_INT itercount, ierr = 0;;
   MKL_INT ivar;
   double b_2norm;
   char cvar = 'N';
   MKL_INT incx = 1;
   if (taskid == MASTER) {
       ipar[14] = RESTART;   // restart iteration number
       int n_tmp = (2 * ipar[14] + 1) * m + ipar[14] * (ipar[14] + 9) / 2 + 1;
       tmp = (double *) malloc(sizeof(double) * n_tmp);
       computed_solution = (double *) malloc(sizeof(double) * m);
       residual = (double *) malloc(sizeof(double) * m);
       f = (double *) malloc(sizeof(double) * m);

       ivar = m;
       /*---------------------------------------------------------------------------
       * Initialize the initial guess
       *---------------------------------------------------------------------------*/
       for (i = 0; i < m; i++) {
           computed_solution[i] = 0.5;
       }

       b_2norm = cblas_dnrm2(ivar, b, incx);
       // printf("b_2norm=%f\n",b_2norm);
       /*---------------------------------------------------------------------------
       * Initialize the solver
       *---------------------------------------------------------------------------*/
       dfgmres_init(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp);
       RCI_flag = RCI_request;
   }
   MPI_Bcast(&RCI_flag, 1, MPI_INT, MASTER, comm);
   if (RCI_flag != 0)
       goto FAILED;

   if (taskid == MASTER) {
       /*---------------------------------------------------------------------------
       * GMRES: Set the desired parameters:
       *---------------------------------------------------------------------------*/
       ipar[14] = RESTART;   // restart iteration number
       ipar[7] = 1;   //do the stopping test
       ipar[10] = 1;   // use preconditioner
       dpar[0] = TOL;
       /*---------------------------------------------------------------------------
       * Check the correctness and consistency of the newly set parameters
       *---------------------------------------------------------------------------*/
       dfgmres_check(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp);
       RCI_flag = RCI_request;
   }

   MPI_Bcast(&RCI_flag, 1, MPI_INT, MASTER, comm);
   if (RCI_flag != 0)
       goto FAILED;

   if (taskid == MASTER) {
       /*---------------------------------------------------------------------------
       * Print the info about the RCI FGMRES method
       *---------------------------------------------------------------------------*/
       printf("Some info about the current run of RCI FGMRES method:\n\n");
       if (ipar[7]) {
           printf("As ipar[7]=%d, the automatic test for the maximal number of ", ipar[7]);
           printf("iterations will be\nperformed\n");
       } else {
           printf("As ipar[7]=%d, the automatic test for the maximal number of ", ipar[7]);
           printf("iterations will be\nskipped\n");
       }
       printf("+++\n");
       if (ipar[8]) {
           printf("As ipar[8]=%d, the automatic residual test will be performed\n", ipar[8]);
       } else {
           printf("As ipar[8]=%d, the automatic residual test will be skipped\n", ipar[8]);
       }
       printf("+++\n");
       if (ipar[9]) {
           printf("As ipar[9]=%d, the user-defined stopping test will be ", ipar[9]);
           printf("requested via\nRCI_request=2\n");
       } else {
           printf("As ipar[9]=%d, the user-defined stopping test will not be ", ipar[9]);
           printf("requested, thus,\nRCI_request will not take the value 2\n");
       }
       printf("+++\n");
       if (ipar[10]) {
           printf("As ipar[10]=%d, the Preconditioned FGMRES iterations will be ", ipar[10]);
           printf("performed, thus,\nthe preconditioner action will be requested via ");
           printf("RCI_request=3\n");
       } else {
           printf("As ipar[10]=%d, the Preconditioned FGMRES iterations will not ", ipar[10]);
           printf("be performed,\nthus, RCI_request will not take the value 3\n");
       }
       printf("+++\n");
       if (ipar[11]) {
           printf("As ipar[11]=%d, the automatic test for the norm of the next ", ipar[11]);
           printf("generated vector is\nnot equal to zero up to rounding and ");
           printf("computational errors will be performed,\nthus, RCI_request will not ");
           printf("take the value 4\n");
       } else {
           printf("As ipar[11]=%d, the automatic test for the norm of the next ", ipar[11]);
           printf("generated vector is\nnot equal to zero up to rounding and ");
           printf("computational errors will be skipped,\nthus, the user-defined test ");
           printf("will be requested via RCI_request=4\n");
       }
       printf("+++\n\n");
   }
   /*---------------------------------------------------------------------------
   * Compute the solution by RCI (P)FGMRES solver with preconditioning
   * Reverse Communication starts here
   *---------------------------------------------------------------------------*/
ONE:
   if (taskid == MASTER) {
       dfgmres(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp);
       RCI_flag = RCI_request;
   }
   MPI_Bcast(&RCI_flag, 1, MPI_INT, MASTER, comm);   // send RCI_request from node MASTER to all other nodes.

   /*---------------------------------------------------------------------------
   * If RCI_request=0, then the solution was found with the required precision
   *---------------------------------------------------------------------------*/
   if (RCI_flag == 0)
       goto COMPLETE;
   /*---------------------------------------------------------------------------
   * If RCI_request=1, then compute the vector A*tmp[ipar[21]-1]
   * and put the result in vector tmp[ipar[22]-1]
   *---------------------------------------------------------------------------
   * NOTE that ipar[21] and ipar[22] contain FORTRAN style addresses,
   * therefore, in C code it is required to subtract 1 from them to get C style
   * addresses
   *---------------------------------------------------------------------------*/
   if (RCI_flag == 1) {
       if (taskid == MASTER) {
           temp_1 = &tmp[ipar[21] - 1];
           temp_2 = &tmp[ipar[22] - 1];
       }

       mpi_dgemv(m, local_m, local_A, temp_1, temp_2, local_u, local_v, taskid, comm);

       goto ONE;
   }
   /*---------------------------------------------------------------------------
   * If RCI_request=2, then do the user-defined stopping test
   * The residual stopping test for the computed solution is performed here
   *---------------------------------------------------------------------------
   */
   if (RCI_flag == 2) {
       /* Request to the dfgmres_get routine to put the solution into b[N] via ipar[12]
       --------------------------------------------------------------------------------
       WARNING: beware that the call to dfgmres_get routine with ipar[12]=0 at this
       stage may destroy the convergence of the FGMRES method, therefore, only
       advanced users should exploit this option with care */
       if (taskid == MASTER) {
           ipar[12] = 1;
           /* Get the current FGMRES solution in the vector f */
           dfgmres_get(&ivar, computed_solution, f, &RCI_request, ipar, dpar, tmp, &itercount);
           temp_1 = f;
           temp_2 = residual;
       }
       /* Compute the current true residual via mpi mat_vec multiplication */
       mpi_dgemv(m, local_m, local_A, temp_1, temp_2, local_u, local_v, taskid, comm);

       if (taskid == MASTER) {
           dvar = -1.0E0;
           cblas_daxpy(ivar, dvar, b, incx, residual, incx);
           dvar = cblas_dnrm2(ivar, residual, incx);
           printf("iteration %d, relative residual:%e\n", itercount, dvar);
       }

       MPI_Bcast(&dvar, 1, MPI_DOUBLE, MASTER, comm);
       if (dvar < TOL) {
           goto COMPLETE;
       } else
           goto ONE;
   }
   /*---------------------------------------------------------------------------
   * If RCI_request=3, then apply the preconditioner on the vector
   * tmp[ipar[21]-1] and put the result in vector tmp[ipar[22]-1]
   *---------------------------------------------------------------------------
   * NOTE that ipar[21] and ipar[22] contain FORTRAN style addresses,
   * therefore, in C code it is required to subtract 1 from them to get C style
   * addresses
   *---------------------------------------------------------------------------*/
   if (RCI_flag == 3) {
       if (taskid == MASTER) {
           temp_1 = &tmp[ipar[21] - 1];
           temp_2 = &tmp[ipar[22] - 1];
       }
       /* start modification 3: solve L U temp_2 = temp_1 */
       mpi_preconditioner_solver(m, local_m, local_M, temp_1, temp_2, local_u, taskid, comm);
       /* end modification 3 */
       goto ONE;
   }
   /*---------------------------------------------------------------------------
   * If RCI_request=4, then check if the norm of the next generated vector is
   * not zero up to rounding and computational errors. The norm is contained
   * in dpar[6] parameter
   *---------------------------------------------------------------------------*/
   if (RCI_flag == 4) {
       if (taskid == MASTER)
           dvar = dpar[6];
       MPI_Bcast(&dvar, 1, MPI_DOUBLE, MASTER, comm);
       if (dvar < 1.0E-12) {
           goto COMPLETE;
       } else
           goto ONE;
   }
   /*---------------------------------------------------------------------------
   * If RCI_request=anything else, then dfgmres subroutine failed
   * to compute the solution vector: computed_solution[N]
   *---------------------------------------------------------------------------*/
   else {
       goto FAILED;
   }
   /*---------------------------------------------------------------------------
   * Reverse Communication ends here
   * Get the current iteration number and the FGMRES solution (DO NOT FORGET to
   * call dfgmres_get routine as computed_solution is still containing
   * the initial guess!). Request to dfgmres_get to put the solution
   * into vector computed_solution[N] via ipar[12]
   *---------------------------------------------------------------------------*/
COMPLETE:if (taskid == MASTER) {
       ipar[12] = 0;
       dfgmres_get(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp, &itercount);
           /*---------------------------------------------------------------------------
           * Print solution vector: computed_solution[N] and the number of iterations: itercount
           *---------------------------------------------------------------------------*/
       printf("The system has been solved in %d iterations \n", itercount);
       printf("The following solution has been obtained (first 4 elements): \n");
       for (i = 0; i < 4; i++) {
           printf("computed_solution[%d]=", i);
           printf("%e\n", computed_solution[i]);
       }

           /*-------------------------------------------------------------------------*/
       /* Release internal MKL memory that might be used for computations */
       /* NOTE: It is important to call the routine below to avoid memory leaks */
       /* unless you disable MKL Memory Manager */
           /*-------------------------------------------------------------------------*/
       MKL_Free_Buffers();
       temp_1 = computed_solution;
       temp_2 = residual;
   }
   // compute the relative residual
   mpi_dgemv(m, local_m, local_A, temp_1, temp_2, local_u, local_v, taskid, comm);
   if (taskid == MASTER) {
       dvar = -1.0E0;
       cblas_daxpy(ivar, dvar, b, incx, residual, incx);
       dvar = cblas_dnrm2(ivar, residual, incx);
       printf("relative residual:%e\n", dvar / b_2norm);

       if (itercount < MAXIT && dvar < TOL)
           flag = 0;   //success
       else
           flag = 1;   //fail

   }

   MPI_Bcast(&flag, 1, MPI_INT, MASTER, comm);

   free(local_A);
   free(local_M);
   free(local_u);
   free(local_v);
   if (taskid == MASTER) {
       free(tmp);
       free(b);
       free(computed_solution);
       free(residual);
   }

   if (flag == 0) {
       MPI_Finalize();
       return 0;
   } else {
       MPI_Finalize();
       return 1;
   }
   /* Release internal MKL memory that might be used for computations */
   /* NOTE: It is important to call the routine below to avoid memory leaks */
   /* unless you disable MKL Memory Manager */
       /*-------------------------------------------------------------------------*/
FAILED:
   if (taskid == MASTER) {
       printf("\nThis example FAILED as the solver has returned the ERROR code %d", RCI_request);
       MKL_Free_Buffers();
   }
   free(local_A);
   free(local_M);
   free(local_u);
   free(local_v);
   if (taskid == MASTER) {
       free(tmp);
       free(b);
       free(computed_solution);
       free(residual);
   }

   MPI_Finalize();
   return 1;
}

void mpi_dgemv(const MKL_INT m, const MKL_INT local_m, const double *local_A, const double *u, double *v, double *local_u, double *local_v, int taskid, MPI_Comm comm)
{
   // compute v=A*u in MPI
   CBLAS_LAYOUT layout = CblasColMajor;   //col major
   CBLAS_TRANSPOSE trans = CblasNoTrans;   // no transfer

   MPI_Scatter(u, local_m, MPI_DOUBLE, local_u, local_m, MPI_DOUBLE, MASTER, comm);   // send u_i from node MASTER to all other nodes.
   // printf("scatter finish at taskid=%d\n",taskid);
   // compute A_i
   cblas_dgemv(layout, trans, m, local_m, 1.0, local_A, m, local_u, 1, 0.0, local_v, 1);

   // Apply a reduction operation on all nodes and place the result in vector v.
   MPI_Reduce(local_v, v, m, MPI_DOUBLE, MPI_SUM, MASTER, comm);
}

void mpi_preconditioner_solver(const MKL_INT m, const MKL_INT local_m, const double *local_M, const double *u, double *v, double *local_u, int taskid, MPI_Comm comm)
{
   int i = 0;
   // printf("begin taskid=%d\n",taskid);
   MPI_Scatter(u, local_m, MPI_DOUBLE, local_u, local_m, MPI_DOUBLE, MASTER, comm);   // send u_i from node MASTER to all other nodes.
   // printf("taskid=%d\n",taskid);
   //compute Mi^(-1)*y_i at each node
   for (i = 0; i < local_m; i++)
       *(local_u + i) /= *(local_M + i);

   // Apply a gather operation on all nodes
   MPI_Gather(local_u, local_m, MPI_DOUBLE, v, local_m, MPI_DOUBLE, MASTER, comm);
}

$ . /opt/intel/bin/compilervars.sh intel64
$ mpicc gmres_test.c -o gmres_test -lmkl_rt

---------------------
作者：chenjun15
来源：CSDN
原文：https://blog.csdn.net/chenjun15/article/details/75041932
版权声明：本文为博主原创文章，转载请附上博文链接！

20kg镀金

关注

1
点赞
踩
15

收藏

觉得还不错? 一键收藏
1
评论
Linux 版的 Intel MKL 的安装使用

1.下载https://software.intel.com/en-us/mkl链接：https://pan.baidu.com/s/1ysHRNqGOhL72YC7KZXU_uA 密码：8ivh最新版下载方法请自行研究。文件名字类似 l_mkl_2017.3.196.tgz2.安装1）解压至任意目录（安装后可删除）2）# ./install.sh默认安装至 /op...
复制链接

扫一扫

专栏目录