构造基于Pthreads的并行for循环分解、分配和执行机制
1) 基于pthreads多线程库提供的基本函数(如线程创建、线程join、线程同步等),构建parallel_for函数,实现对循环的分解、分配和执行。函数参数包括但不限于(int start, int end, int increment, void *(*functor)(void *), void *arg, int num_threads);其中start为循环开始索引;end为循环结束索引;increment为每次循环索引的增量;functor为函数指针,指向需要被并行执行的循环程序块;arg为functor的入口参数;num_threads为并行线程数。
2) 在Linux系统中将parallel_for函数编译为.so文件,由其他程序调用。
3) 将通用矩阵乘法的for循环,改造成基于parallel_for函数并行化的矩阵乘法,注意只改造可被并行执行的for循环(例如无race condition、无数据依赖、无循环依赖等)。
代码段:
parallel.c
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
#include<pthread.h>
#include<parallel.h>
/* Matrices shared with functor: A and B are read, C is written.
 * All three are indexed row-major as [row * M + column]. */
int *A;
int *B;
int *C;

/* Matrix dimension: every matrix is M x M. */
int M;

/* Timing scratch values; not used by functor or parallel_for. */
int time1;
int time2;

/* Worker-thread count.
 * NOTE(review): the driver program defines a global with the same name;
 * confirm both names refer to one object at link time. */
int thread_count;
/*
 * Describes the slice of a loop handed to one worker.
 * functor iterates i = start, start + increment, ... while i <= end,
 * so `end` is an inclusive bound.
 */
struct for_index {
    int start;      /* first index of the slice */
    int end;        /* last index of the slice (inclusive) */
    int increment;  /* stride between consecutive indices */
};
void* functor(void * argv){
struct for_index* index = (struct for_index*) argv;
int start = index->start;
int end = index->end;
int increment = index->increment;
for(int i = start;i <= end;i+=increment){
for(int j = 0;j < M;++j){
int temp = 0;
for(int k = 0;k < M;++k){
temp += A[i*M+k]*B[k*M+j];
}
C[i*M+j] = temp;
}
}
}
void parallel_for(int start, int end, int increment, void*(*functor)(void*), void *arg, int num_threads){
pthread_t p_thread[thread_count];
int counts = end-start;
int threads = num_threads;
if(num_threads >= counts) threads = counts;
int average_loop = counts/num_threads;
for(int i = 0;i < threads;++i){
struct for_index* index = (struct for_index*)malloc(sizeof(struct for_index));
index->start = average_loop*i;
index->end = increment;
if(i < threads-1){
index->end = average_loop*(i+1)-1;
}
else{
index->end = counts - 1;
}
pthread_create(&(p_thread[i]),NULL,functor,(void*)index);
}
for(int i = 0; i < threads;i++){
pthread_join(p_thread[i],NULL);
}
}
parallel.h
/* Public interface of the pthread-based parallel_for library. */
#ifndef PARALLEL_FOR_H
#define PARALLEL_FOR_H
#include<stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Runs a loop over [start, end) with the given increment on up to
 * num_threads POSIX threads and waits for all of them to finish.
 *
 * @param start        first loop index
 * @param end          end of the index range
 * @param increment    loop stride
 * @param functor      routine executed by each worker thread
 * @param arg          caller-supplied argument intended for functor
 * @param num_threads  number of worker threads requested
 */
void parallel_for(int start, int end, int increment, void*(*functor)(void*), void *arg, int num_threads);

/**
 * Worker routine with the pthread start-routine signature.
 *
 * @param argv  opaque pointer handed to the worker by parallel_for
 */
void* functor(void * argv);

#ifdef __cplusplus
}
#endif

#endif /* PARALLEL_FOR_H */
program.c
#include<stdlib.h>
#include<stdio.h>
#include<time.h>
#include<pthread.h>
#include<parallel.h>
/* Matrices shared with functor: A and B are read, C is written.
 * All three are indexed row-major as [row * M + column]. */
int *A;
int *B;
int *C;

/* Matrix dimension (every matrix is M x M); main reads it from argv[1]. */
int M;

/* clock() readings stored by main before and after the multiplication. */
int time1;
int time2;

/* Worker-thread count; main reads it from argv[2]. */
int thread_count;
// void print_matrix(int* A){
// for(int i = 0;i < M;++i){
// for(int j = 0;j < M;++j){
// printf("%d ",A[i*M+j]);
// }
// printf("\n");
// }
// }
/*
 * Describes the slice of a loop handed to one worker.
 * functor iterates i = start, start + increment, ... while i <= end,
 * so `end` is an inclusive bound.
 */
struct for_index {
    int start;      /* first index of the slice */
    int end;        /* last index of the slice (inclusive) */
    int increment;  /* stride between consecutive indices */
};
void* functor(void * argv){
struct for_index* index = (struct for_index*) argv;
int start = index->start;
int end = index->end;
int increment = index->increment;
for(int i = start;i <= end;i+=increment){
for(int j = 0;j < M;++j){
int temp = 0;
for(int k = 0;k < M;++k){
temp += A[i*M+k]*B[k*M+j];
}
C[i*M+j] = temp;
}
}
}
void parallel_for(int start, int end, int increment, void*(*functor)(void*), void *arg, int num_threads){
pthread_t p_thread[thread_count];
int counts = end-start;
int threads = num_threads;
if(num_threads >= counts) threads = counts;
int average_loop = counts/num_threads;
for(int i = 0;i < threads;++i){
struct for_index* index = (struct for_index*)malloc(sizeof(struct for_index));
index->start = average_loop*i;
index->end = increment;
if(i < threads-1){
index->end = average_loop*(i+1)-1;
}
else{
index->end = counts - 1;
}
pthread_create(&(p_thread[i]),NULL,functor,(void*)index);
}
for(int i = 0; i < threads;i++){
pthread_join(p_thread[i],NULL);
}
}
/* Parses `text` as a decimal integer in [1, limit]; returns 0 if it is malformed or out of range. */
static int parse_positive_int(const char *text, long limit)
{
    char *end = NULL;
    const long value = strtol(text, &end, 10);

    /* On overflow strtol returns LONG_MAX, which the limit check rejects. */
    if (end == text || *end != '\0' || value <= 0 || value > limit) {
        return 0;
    }
    return (int)value;
}

/**
 * Benchmark driver: fills two M x M matrices with random values in [0, 4],
 * multiplies them through parallel_for and prints the elapsed time.
 *
 * Usage: program <M> <thread_count>
 *
 * @return EXIT_SUCCESS, or EXIT_FAILURE on bad arguments or allocation failure
 */
int main(int argc, char ** argv){
    /* functor indexes matrices with int, so M * M must fit in an int
     * (46340 is the largest such M when int is 32 bits wide). */
    enum { MAX_DIM = 46340, MAX_THREADS = 65536 };

    if (argc < 3) {
        fprintf(stderr, "usage: %s <matrix_size> <thread_count>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    M = parse_positive_int(argv[1], MAX_DIM);
    thread_count = parse_positive_int(argv[2], MAX_THREADS);
    if (M == 0 || thread_count == 0) {
        fprintf(stderr, "matrix_size must be in [1, %d] and thread_count in [1, %d]\n",
                (int)MAX_DIM, (int)MAX_THREADS);
        return EXIT_FAILURE;
    }

    const size_t cells = (size_t)M * (size_t)M;
    /* calloc checks the size multiplication for overflow and zero-fills C. */
    A = calloc(cells, sizeof *A);
    B = calloc(cells, sizeof *B);
    C = calloc(cells, sizeof *C);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "out of memory for %d x %d matrices\n", M, M);
        free(A);
        free(B);
        free(C);
        A = B = C = NULL;
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < cells; ++i) {
        A[i] = rand() % 5;
        B[i] = rand() % 5;
    }

    /* NOTE: clock() reports processor time used by the program, not
     * wall-clock time, so it may not reflect the speed-up from extra threads. */
    const clock_t began = clock();
    parallel_for(0, M, 1, functor, NULL, thread_count);
    const clock_t ended = clock();

    /* Keep the file-scope timing variables updated for any other reader. */
    time1 = (int)began;
    time2 = (int)ended;

    // printf("A is\n");
    // print_matrix(A);
    // printf("B is\n");
    // print_matrix(B);
    // printf("C is\n");
    // print_matrix(C);
    free(A);
    free(B);
    free(C);
    A = B = C = NULL;

    /* CLOCKS_PER_SEC is the standard divisor; the cast makes the argument match %f. */
    printf("Time is:%.6f", (double)(ended - began) / CLOCKS_PER_SEC);
    return EXIT_SUCCESS;
}