#include "stdio.h"
#include "stdlib.h"
#include<math.h>
#include<time.h>
#include<ctime>
#include "omp.h"
#include "windows.h"
// Matrix dimensions. The multiplication loops iterate the shared dimension up
// to COL1 and use it to index the rows of b, so COL1 is expected to equal ROW2.
#define ROW1 256 // number of rows in the left matrix
#define COL1 256 // number of columns in the left matrix
#define ROW2 256 // number of rows in the right matrix
#define COL2 256 // number of columns in the right matrix
#define MAX_THREADS 128 // largest thread count that is tried
// a is the left operand, b the right operand, and c receives the product a*b.
static int a[ROW1][COL1];
static int b[ROW2][COL2];
static int c[ROW1][COL2];
// Performance-counter frequency as reported by QueryPerformanceFrequency();
// written once in main() and used as the divisor in cost_time_s().
double dqFreq;
void initArray();
void init_c_array();
// NOTE(review): no definition of print() is visible in this part of the file.
void print(int* array, int row, int col);
LARGE_INTEGER now_time_count();
double cost_time_s(LARGE_INTEGER start_time_count, LARGE_INTEGER end_time_count);
int autoTuning();
int main()
{
omp_set_num_threads(MAX_THREADS);
LARGE_INTEGER f;
QueryPerformanceFrequency(&f);
dqFreq = (double)f.QuadPart;
initArray();
autoTuning();
return 0;
}
/**
 * Purpose : fill the left and right matrices with pseudo-random values in
 *           the range 0..99 and zero the result matrix.
 * Params  : none
 * Returns : nothing
 * Notes   : reseeds rand() from the current time on every call, so the
 *           matrix contents differ between runs.
 */
void initArray()
{
    srand((unsigned int)time(NULL));

    /* Left matrix, filled row by row. */
    for (int row = 0; row < ROW1; ++row) {
        for (int *cell = a[row]; cell != a[row] + COL1; ++cell) {
            *cell = rand() % 100;
        }
    }

    /* Right matrix, filled row by row after the left one. */
    for (int row = 0; row < ROW2; ++row) {
        for (int *cell = b[row]; cell != b[row] + COL2; ++cell) {
            *cell = rand() % 100;
        }
    }

    /* Result matrix starts out as all zeros. */
    for (int row = 0; row < ROW1; ++row) {
        for (int *cell = c[row]; cell != c[row] + COL2; ++cell) {
            *cell = 0;
        }
    }
}
/**
 * Purpose : reset every element of the result matrix c to zero.
 * Params  : none
 * Returns : nothing
 */
void init_c_array()
{
    for (int row = 0; row < ROW1; ++row) {
        int *cell = c[row];
        int *const row_end = cell + COL2;

        while (cell != row_end) {
            *cell++ = 0;
        }
    }
}
/**
 * Purpose : benchmark the matrix product c = a * b, first serially and then
 *           once for every OpenMP thread count from 8 up to MAX_THREADS,
 *           printing each timing followed by a summary of the fastest
 *           thread count.
 * Params  : none (reads the global matrices a and b, overwrites c)
 * Returns : the thread count with the lowest measured time, or 0 if no
 *           thread count produced a finite timing.
 * Notes   : every configuration is measured exactly once. The result matrix
 *           is cleared and the thread count is set before the timer starts,
 *           so each interval covers only the multiplication itself.
 */
int autoTuning()
{
    /* Lowest thread count that is benchmarked. */
    const int first_thread_count = 8;

    /* Serial baseline. c is cleared before the timer starts so the measured
     * interval covers the same work as the parallel runs below. */
    init_c_array();
    LARGE_INTEGER start_time_count_serial = now_time_count();
    for (int i = 0; i < ROW1; i++) {
        for (int j = 0; j < COL2; j++) {
            for (int k = 0; k < COL1; k++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    LARGE_INTEGER end_time_count_serial = now_time_count();
    double serial_time_cost_s = cost_time_s(start_time_count_serial, end_time_count_serial);
    printf("进行串行计算:大矩阵相乘总用时%15.13fs\n", serial_time_cost_s);

    /* HUGE_VAL compares greater than any finite timing, so the first
     * measurement always becomes the initial minimum. */
    double min_time_cost_s = HUGE_VAL;
    int min_time_threads = 0;

    for (int num_threads = first_thread_count; num_threads <= MAX_THREADS; num_threads++) {
        /* Setup that is not part of the multiplication stays outside the
         * timed interval. */
        init_c_array();
        omp_set_num_threads(num_threads);

        LARGE_INTEGER start_time_count = now_time_count();
        /* The (i, j) iteration space is split across threads; each iteration
         * writes only its own c[i][j], so no synchronization is needed.
         * The collapse clause requires OpenMP 3.0 or newer. */
#pragma omp parallel for collapse(2)
        for (int i = 0; i < ROW1; i++)
            for (int j = 0; j < COL2; j++)
                for (int k = 0; k < COL1; k++)
                    c[i][j] += a[i][k] * b[k][j];
        LARGE_INTEGER end_time_count = now_time_count();

        double time_cost_s = cost_time_s(start_time_count, end_time_count);
        printf("线程数:%3d,大矩阵相乘总用时%15.13fs\n", num_threads, time_cost_s);

        /* Strict comparison: on a tie the smaller thread count is kept. */
        if (time_cost_s < min_time_cost_s) {
            min_time_cost_s = time_cost_s;
            min_time_threads = num_threads;
        }
    }

    printf("%d行%d列矩阵与%d行%d列矩阵相乘\n处理器数%d\n大矩阵相乘最佳线程数为:%d\n总用时%15.13fs\n", ROW1, COL1, ROW2, COL2, omp_get_num_procs(), min_time_threads, min_time_cost_s);
    return min_time_threads;
}
/**
 * Purpose : sample the performance counter; called once at the start and
 *           once at the end of a timed interval.
 * Params  : none
 * Returns : the current counter value (raw counts, not seconds)
 * Notes   : omp_get_wtime() could be used for timing instead.
 */
LARGE_INTEGER now_time_count()
{
    LARGE_INTEGER ticks;

    QueryPerformanceCounter(&ticks);
    return ticks;
}
/**
 * Purpose : convert a pair of counter samples into an elapsed time.
 * Param start_time_count : counter value sampled at the start of the interval
 * Param end_time_count   : counter value sampled at the end of the interval
 * Returns : the counter difference divided by the global dqFreq, in seconds
 * Notes   : dqFreq must already hold the counter frequency.
 *           omp_get_wtime() could be used for timing instead.
 */
double cost_time_s(LARGE_INTEGER start_time_count, LARGE_INTEGER end_time_count)
{
    const LONGLONG elapsed_ticks = end_time_count.QuadPart - start_time_count.QuadPart;

    return elapsed_ticks / dqFreq;
}