/*
 * 平台: 银河二 (Tianhe-2) 超算系统
 * CPU 型号: Intel Xeon E5-2692 12C 2.200GHz
 * 采用 TH Express-2 高速互连
 * MPI 版本: MPI/Intel/MPICH/3.2-icc14-dyn
 */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mpi.h"
/* Global matrix dimensions: 2^12 x 2^12 (4096).
 * Integer constant expressions — the original pow(2,12) was a runtime
 * double-valued libm call (and pulled in a C++ header) for what is a
 * compile-time size; the numeric value is unchanged. */
#define MATRIX_ROW ((size_t)1 << 12)
#define MATRIX_COM ((size_t)1 << 12)
/*
 * Compute this rank's slice of y = A * x for a block-row-distributed
 * dense matrix.  Each rank holds local_m consecutive rows of A and
 * local_n consecutive entries of x; the full x is assembled with an
 * Allgather before the local dot products.
 *
 * local_A  in:  local_m x n row block of A (row-major)
 * local_x  in:  this rank's local_n entries of x
 * local_y  out: this rank's local_m entries of y
 * local_m  in:  rows owned by this rank
 * n        in:  global column count of A (= length of x)
 * local_n  in:  x entries owned per rank (assumed equal on all ranks)
 * comm     in:  communicator spanning all participating ranks
 */
void Mat_vect_mul(
double local_A[], // in
double local_x[], // in
double local_y[], // out
size_t local_m, // in
size_t n, // in
size_t local_n, // in
MPI_Comm comm // in
)
{
size_t local_i, j;
/* Scratch buffer for the fully assembled x vector. */
double* x = malloc(n * sizeof *x);
if (x == NULL) {
    /* A collective is about to run; a local exit would deadlock peers. */
    fprintf(stderr, "Mat_vect_mul: malloc of %zu doubles failed\n", n);
    MPI_Abort(comm, EXIT_FAILURE);
}
/* MPI counts are int; these problem sizes fit comfortably. */
MPI_Allgather(local_x, (int)local_n, MPI_DOUBLE,
              x, (int)local_n, MPI_DOUBLE, comm);
for (local_i = 0; local_i < local_m; local_i++) {
    local_y[local_i] = 0.0;
    for (j = 0; j < n; j++)
        local_y[local_i] += local_A[local_i * n + j] * x[j];
}
free(x);
}
/* Fill global_A with a deterministic test pattern: 1.0, 1.1, 1.2, ...
 * produced by repeated accumulation in steps of 0.1. */
void generateA(double* global_A, size_t size)
{
    double value = 1.0;
    for (size_t idx = 0; idx < size; idx++) {
        global_A[idx] = value;
        value += 0.1;
    }
}
/* Fill this rank's local A block with the same deterministic pattern
 * (1.0, 1.1, ... accumulated in 0.1 steps) used by generateA.
 * my_rank is kept for interface compatibility but intentionally unused:
 * each rank regenerates its data locally instead of scattering a global
 * matrix, which avoids allocating the full matrix on every rank.
 * NOTE(review): because my_rank is ignored, all ranks hold identical row
 * blocks — adequate for a timing benchmark, not for a real decomposition. */
void generateLocalA2(double* local_A, int my_rank, size_t size)
{
(void)my_rank; /* unused by design; silences -Wunused-parameter */
double temp = 1.0;
size_t i = 0;
for (i = 0; i < size; i++)
{
local_A[i] = temp;
temp += 0.1;
}
}
/* Initialize the global x vector.  Every element is 1.0: the original
 * declared a step variable but never advanced it, so the observable
 * result is a constant vector of ones (behavior preserved here).
 * NOTE(review): if an incrementing pattern like generateA was intended,
 * that would be a separate behavior change — confirm with the author. */
void generateX(double* global_x, size_t size)
{
    for (size_t i = 0; i < size; i++)
        global_x[i] = 1.0;
}
/* Copy this rank's block of local_m consecutive rows (each of length n)
 * out of the row-major global matrix.  Rank r owns rows
 * [r*local_m, (r+1)*local_m).
 * BUG FIX: the original started at row `my_rank` instead of row
 * `my_rank * local_m`, so for local_m > 1 adjacent ranks' blocks
 * overlapped and most of the matrix was never distributed to any rank. */
void generateLocalA(double* local_A, double* global_A, size_t my_rank, size_t n, size_t local_m) {
size_t first = my_rank * local_m * n; /* flat index of this rank's first element */
size_t count = local_m * n;           /* elements per rank */
size_t j;
for (j = 0; j < count; j++)
local_A[j] = global_A[first + j];
}
/* Extract this rank's contiguous slice of the global x vector:
 * rank r owns elements [r*local_n, (r+1)*local_n). */
void generateLocalX(double* local_x, double* global_x, size_t my_rank, size_t local_n) {
    memcpy(local_x, global_x + my_rank * local_n, local_n * sizeof *local_x);
}
/*
 * Driver: times a block-row-distributed dense matrix-vector multiply.
 * Each rank generates its own row block and x slice locally, performs the
 * local product, and the result vector is gathered on all ranks.
 * Rank 0 reports the wall-clock time of the compute + gather phase.
 */
int main(void) {
int comm_size;
int my_rank;
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// global values
size_t m = MATRIX_ROW;
size_t n = MATRIX_COM;
/* The even block decomposition below silently drops rows/entries when the
 * dimensions do not divide the rank count — fail loudly instead. */
if (m % (size_t)comm_size != 0 || n % (size_t)comm_size != 0) {
    if (my_rank == 0)
        fprintf(stderr, "matrix dims (%zu x %zu) must be divisible by comm_size %d\n",
                m, n, comm_size);
    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
}
double* global_x = malloc(n * sizeof *global_x);
if (!global_x) { perror("malloc global_x"); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); }
double* global_y = malloc(m * sizeof *global_y);
if (!global_y) { perror("malloc global_y"); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); }
generateX(global_x, n);
// local values
size_t local_m = m / comm_size;   /* rows of A per rank */
size_t local_n = n / comm_size;   /* entries of x per rank */
double* local_A = malloc(local_m * n * sizeof *local_A);
if (!local_A) { perror("malloc local_A"); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); }
double* local_x = malloc(local_n * sizeof *local_x);
if (!local_x) { perror("malloc local_x"); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); }
double* local_y = malloc(local_m * sizeof *local_y);
if (!local_y) { perror("malloc local_y"); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); }
/* Each rank generates its block directly; no global A is materialized. */
generateLocalA2(local_A, my_rank, local_m * n);
generateLocalX(local_x, global_x, my_rank, local_n);
/* Synchronize so rank 0's timer does not start while peers are still
 * initializing their data. */
MPI_Barrier(MPI_COMM_WORLD);
double start = 0.0, end = 0.0;
if (my_rank == 0) {
// Time Begin
start = MPI_Wtime();
}
Mat_vect_mul(local_A, local_x, local_y, local_m, n, local_n, MPI_COMM_WORLD);
double* y = malloc(m * sizeof *y);
if (!y) { perror("malloc y"); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); }
MPI_Allgather(local_y, (int)local_m, MPI_DOUBLE, y, (int)local_m, MPI_DOUBLE, MPI_COMM_WORLD);
if (my_rank == 0) {
end = MPI_Wtime();
double time = end - start;
printf("Time cost is --%f-- seconds.\n", time);
}
free(global_x);
free(global_y);
free(local_A);
free(local_x);
free(local_y);
free(y);
MPI_Finalize();
return 0;
}
/*
 * 运行命令:
 *   mpicc -o multi_4 multi_4.c
 *   module load MPI/Intel/MPICH/3.2-icc14-dyn
 *   yhrun -p paratera -N 3 -n 64 -t 20 multi_4
 */