Writing and Debugging an MPI Parallel Program (Odd-Even Transposition Sort)
1. Writing the MPI Parallel Program
- The code for odd-even transposition sort running on multiple MPI processes is given below; the odd-even sort algorithm itself is not described in further detail here.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>
const int RMAX = 100;   /* generated keys lie in [0, RMAX) */

void Usage(char* program);
void Print_list(int local_A[], int local_n, int rank);
void Merge_low(int local_A[], int temp_B[], int temp_C[], int local_n);
void Merge_high(int local_A[], int temp_B[], int temp_C[], int local_n);
void Generate_list(int local_A[], int local_n, int my_rank);
int Compare(const void* a_p, const void* b_p);
void Get_args(int argc, char* argv[], int* global_n_p, int* local_n_p,
      char* gi_p, int my_rank, int p, MPI_Comm comm);
void Sort(int local_A[], int local_n, int my_rank, int p, MPI_Comm comm);
void Odd_even_iter(int local_A[], int temp_B[], int temp_C[], int local_n,
      int phase, int even_partner, int odd_partner, int my_rank, int p,
      MPI_Comm comm);
void Print_local_lists(int local_A[], int local_n, int my_rank, int p,
      MPI_Comm comm);
void Print_global_list(int local_A[], int local_n, int my_rank, int p,
      MPI_Comm comm);
void Read_list(int local_A[], int local_n, int my_rank, int p,
      MPI_Comm comm);
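/* main: set up MPI, parse the command line, generate ('g') or read ('i')
 * the distributed list, sort it with parallel odd-even transposition
 * sort, and print the result on process 0. */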
int main(int argc, char* argv[]) {
   int my_rank, p;
   char g_i;
   int *local_A;
   int global_n;
   int local_n;
   MPI_Comm comm;

   MPI_Init(&argc, &argv);
   comm = MPI_COMM_WORLD;
   MPI_Comm_size(comm, &p);
   MPI_Comm_rank(comm, &my_rank);

   Get_args(argc, argv, &global_n, &local_n, &g_i, my_rank, p, comm);
   local_A = (int*) malloc(local_n*sizeof(int));
   if (g_i == 'g') {
      Generate_list(local_A, local_n, my_rank);
      Print_local_lists(local_A, local_n, my_rank, p, comm);
   } else {
      Read_list(local_A, local_n, my_rank, p, comm);
#ifdef DEBUG
      Print_local_lists(local_A, local_n, my_rank, p, comm);
#endif
   }

#ifdef DEBUG
   printf("Proc %d > Before Sort\n", my_rank);
   fflush(stdout);
#endif

   Sort(local_A, local_n, my_rank, p, comm);

#ifdef DEBUG
   Print_local_lists(local_A, local_n, my_rank, p, comm);
   fflush(stdout);
#endif

   Print_global_list(local_A, local_n, my_rank, p, comm);
   free(local_A);
   MPI_Finalize();
   return 0;
}
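/* Generate_list: fill local_A with local_n random keys in [0, RMAX);
 * seeding with my_rank+1 gives each process a different sequence. */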
void Generate_list(int local_A[], int local_n, int my_rank) {
   int i;

   srandom(my_rank+1);
   for (i = 0; i < local_n; i++)
      local_A[i] = random() % RMAX;
}
void Usage(char* program) {
   fprintf(stderr, "usage: mpirun -np <p> %s <g|i> <global_n>\n", program);
   fprintf(stderr, "   - p: the number of processes\n");
   fprintf(stderr, "   - g: generate random, distributed list\n");
   fprintf(stderr, "   - i: user will input list on process 0\n");
   fprintf(stderr, "   - global_n: number of elements in global list");
   fprintf(stderr, " (must be evenly divisible by p)\n");
   fflush(stderr);
}
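/* Get_args: process 0 parses the command line, then broadcasts the
 * g/i flag and global_n to all processes; a global_n of -1 signals
 * invalid input and makes every process exit. */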
void Get_args(int argc, char* argv[], int* global_n_p, int* local_n_p,
      char* gi_p, int my_rank, int p, MPI_Comm comm) {
   if (my_rank == 0) {
      if (argc != 3) {
         Usage(argv[0]);
         *global_n_p = -1;
      } else {
         *gi_p = argv[1][0];
         if (*gi_p != 'g' && *gi_p != 'i') {
            Usage(argv[0]);
            *global_n_p = -1;
         } else {
            *global_n_p = atoi(argv[2]);
            if (*global_n_p % p != 0) {
               Usage(argv[0]);
               *global_n_p = -1;
            }
         }
      }
   }
   MPI_Bcast(gi_p, 1, MPI_CHAR, 0, comm);
   MPI_Bcast(global_n_p, 1, MPI_INT, 0, comm);
   if (*global_n_p <= 0) {
      MPI_Finalize();
      exit(-1);
   }
   *local_n_p = *global_n_p/p;

#ifdef DEBUG
   printf("Proc %d > gi = %c, global_n = %d, local_n = %d\n",
         my_rank, *gi_p, *global_n_p, *local_n_p);
   fflush(stdout);
#endif
}
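/* Read_list: process 0 reads all p*local_n keys from stdin, then
 * scatters local_n keys to each process. */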
void Read_list(int local_A[], int local_n, int my_rank, int p,
      MPI_Comm comm) {
   int i;
   int *temp = NULL;   /* only allocated and used on process 0 */

   if (my_rank == 0) {
      temp = (int*) malloc(p*local_n*sizeof(int));
      printf("Enter the elements of the list\n");
      for (i = 0; i < p*local_n; i++)
         scanf("%d", &temp[i]);
   }
   MPI_Scatter(temp, local_n, MPI_INT, local_A, local_n, MPI_INT, 0, comm);
   if (my_rank == 0)
      free(temp);
}
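/* Print_global_list: gather all blocks onto process 0 and print the
 * full list; the receive buffer A is only significant on process 0. */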
void Print_global_list(int local_A[], int local_n, int my_rank, int p,
      MPI_Comm comm) {
   int* A = NULL;   /* only significant on process 0 */
   int i, n;

   if (my_rank == 0) {
      n = p*local_n;
      A = (int*) malloc(n*sizeof(int));
      MPI_Gather(local_A, local_n, MPI_INT, A, local_n, MPI_INT, 0, comm);
      printf("Global list:\n");
      for (i = 0; i < n; i++)
         printf("%d ", A[i]);
      printf("\n\n");
      free(A);
   } else {
      MPI_Gather(local_A, local_n, MPI_INT, A, local_n, MPI_INT, 0, comm);
   }
}
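/* Compare: qsort callback, ascending order. */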
int Compare(const void* a_p, const void* b_p) {
   int a = *((int*)a_p);
   int b = *((int*)b_p);

   if (a < b)
      return -1;
   else if (a == b)
      return 0;
   else
      return 1;
}
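/* Sort: quicksort the local keys, then run p phases of odd-even
 * transposition across processes.  Odd-ranked processes pair with
 * rank-1 in even phases and rank+1 in odd phases; even-ranked
 * processes do the reverse.  A partner of MPI_PROC_NULL or -1 means
 * the process sits that phase out. */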
void Sort(int local_A[], int local_n, int my_rank, int p, MPI_Comm comm) {
   int phase;
   int *temp_B, *temp_C;
   int even_partner;
   int odd_partner;

   temp_B = (int*) malloc(local_n*sizeof(int));
   temp_C = (int*) malloc(local_n*sizeof(int));

   if (my_rank % 2 != 0) {
      even_partner = my_rank - 1;
      odd_partner = my_rank + 1;
      if (odd_partner == p) odd_partner = MPI_PROC_NULL;
   } else {
      even_partner = my_rank + 1;
      if (even_partner == p) even_partner = MPI_PROC_NULL;
      odd_partner = my_rank - 1;
   }

   qsort(local_A, local_n, sizeof(int), Compare);

#ifdef DEBUG
   printf("Proc %d > before loop in sort\n", my_rank);
   fflush(stdout);
#endif

   for (phase = 0; phase < p; phase++)
      Odd_even_iter(local_A, temp_B, temp_C, local_n, phase,
            even_partner, odd_partner, my_rank, p, comm);

   free(temp_B);
   free(temp_C);
}
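/* Odd_even_iter: one phase of the parallel sort.  Exchange the whole
 * local block with the phase's partner via MPI_Sendrecv; the
 * lower-ranked process of the pair keeps the smaller half of the
 * combined keys (Merge_low), the higher-ranked one keeps the larger
 * half (Merge_high). */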
void Odd_even_iter(int local_A[], int temp_B[], int temp_C[],
      int local_n, int phase, int even_partner, int odd_partner,
      int my_rank, int p, MPI_Comm comm) {
   MPI_Status status;

   if (phase % 2 == 0) {
      if (even_partner >= 0) {
         MPI_Sendrecv(local_A, local_n, MPI_INT, even_partner, 0,
               temp_B, local_n, MPI_INT, even_partner, 0, comm, &status);
         if (my_rank % 2 != 0)
            Merge_high(local_A, temp_B, temp_C, local_n);
         else
            Merge_low(local_A, temp_B, temp_C, local_n);
      }
   } else {
      if (odd_partner >= 0) {
         MPI_Sendrecv(local_A, local_n, MPI_INT, odd_partner, 0,
               temp_B, local_n, MPI_INT, odd_partner, 0, comm, &status);
         if (my_rank % 2 != 0)
            Merge_low(local_A, temp_B, temp_C, local_n);
         else
            Merge_high(local_A, temp_B, temp_C, local_n);
      }
   }
}
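/* Merge_low: my_keys and recv_keys are each already sorted; keep the
 * local_n smallest of the combined 2*local_n keys in my_keys. */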
void Merge_low(int my_keys[], int recv_keys[], int temp_keys[],
      int local_n) {
   int m_i, r_i, t_i;

   m_i = r_i = t_i = 0;
   while (t_i < local_n) {
      if (my_keys[m_i] <= recv_keys[r_i]) {
         temp_keys[t_i] = my_keys[m_i];
         t_i++; m_i++;
      } else {
         temp_keys[t_i] = recv_keys[r_i];
         t_i++; r_i++;
      }
   }
   memcpy(my_keys, temp_keys, local_n*sizeof(int));
}
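/* Merge_high: the local block and the received block are each already
 * sorted; keep the local_n largest keys, merging from the back. */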
void Merge_high(int local_A[], int temp_B[], int temp_C[],
      int local_n) {
   int ai, bi, ci;

   ai = local_n-1;
   bi = local_n-1;
   ci = local_n-1;
   while (ci >= 0) {
      if (local_A[ai] >= temp_B[bi]) {
         temp_C[ci] = local_A[ai];
         ci--; ai--;
      } else {
         temp_C[ci] = temp_B[bi];
         ci--; bi--;
      }
   }
   memcpy(local_A, temp_C, local_n*sizeof(int));
}
void Print_list(int local_A[], int local_n, int rank) {
   int i;

   printf("%d: ", rank);
   for (i = 0; i < local_n; i++)
      printf("%d ", local_A[i]);
   printf("\n");
}
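/* Print_local_lists: process 0 prints its own block, then receives and
 * prints every other process's block in rank order. */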
void Print_local_lists(int local_A[], int local_n,
      int my_rank, int p, MPI_Comm comm) {
   int* A;
   int q;
   MPI_Status status;

   if (my_rank == 0) {
      A = (int*) malloc(local_n*sizeof(int));
      Print_list(local_A, local_n, my_rank);
      for (q = 1; q < p; q++) {
         MPI_Recv(A, local_n, MPI_INT, q, 0, comm, &status);
         Print_list(A, local_n, q);
      }
      free(A);
   } else {
      MPI_Send(local_A, local_n, MPI_INT, 0, 0, comm);
   }
}
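The conclusion below compares these multi-process times against single-core runs. For reference, a minimal serial odd-even transposition sort might look like the following (an illustrative sketch, not necessarily the exact baseline code used for the measurements):

void Serial_odd_even_sort(int a[], int n) {
   int phase, i, tmp;

   for (phase = 0; phase < n; phase++) {
      if (phase % 2 == 0) {   /* even phase: compare pairs (0,1), (2,3), ... */
         for (i = 1; i < n; i += 2)
            if (a[i-1] > a[i]) {
               tmp = a[i-1]; a[i-1] = a[i]; a[i] = tmp;
            }
      } else {                /* odd phase: compare pairs (1,2), (3,4), ... */
         for (i = 1; i < n-1; i += 2)
            if (a[i] > a[i+1]) {
               tmp = a[i]; a[i] = a[i+1]; a[i+1] = tmp;
            }
      }
   }
}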
2. Compilation
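Assuming the source above is saved as odd_even.c (the file and executable names here are just examples), it can be compiled with the mpicc wrapper and launched with mpirun:

mpicc -g -Wall -o odd_even odd_even.c
mpicc -g -Wall -DDEBUG -o odd_even odd_even.c    # build with the DEBUG printouts enabled
mpirun -np 4 ./odd_even g 100                    # 4 processes, 100 generated keys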
3. Results of Multiple Runs
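The run times discussed in the conclusion come from repeated runs at different list sizes. One way to collect such timings is to wrap the Sort call in main with MPI_Wtime, roughly as follows (a sketch of the measurement, not code from the listing above):

double start, finish;

MPI_Barrier(comm);   /* line the processes up before timing */
start = MPI_Wtime();
Sort(local_A, local_n, my_rank, p, comm);
finish = MPI_Wtime();
if (my_rank == 0)
   printf("Elapsed time = %e seconds\n", finish - start);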
4. Conclusions
- First, when writing code that runs a computation across multiple cores with multiple processes, variable declarations must be placed at the top of a block; depending on the compiler's C standard mode (this is the C89/C90 rule), declarations appearing after statements cause compilation to fail.
- Second, according to the theory, the multi-process run time should be lower than the single-core run time, but in rare cases with small inputs (e.g. sorting only 100 keys, as in the figure above) the multi-process run can actually be slower. Two effects contribute: each run generates a different set of 100 random keys, so one run's input may simply require more comparison and merging work than another's; more importantly, every one of the p phases performs an MPI_Sendrecv, and for small lists this communication overhead can outweigh the computation saved. As the list size grows, computation dominates communication and the multi-process times pull clearly ahead of the single-core times; the comparison in the figure for list sizes above 200 bears this theory out.
- Finally, pay attention to core allocation: launching more processes than the hardware has cores oversubscribes the machine and actually reduces speed (see the example commands below).
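For example, with Open MPI (the --oversubscribe flag is specific to that implementation), the available core count can be checked and respected like this:

nproc                                              # number of cores available on this machine
mpirun -np 4 ./odd_even g 10000                    # stays within the core count
mpirun --oversubscribe -np 16 ./odd_even g 10000   # runs, but is usually slower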