MPI实现fft的迭代算法源于并行计算——结构。算法。编程中伪码更新2

最新推荐文章于 2021-12-03 13:26:59 发布

为啥强制昵称

最新推荐文章于 2021-12-03 13:26:59 发布

阅读量1.1k

点赞数

分类专栏：做并行文章标签：算法 fft 编程 file socket struct

本文链接：https://blog.csdn.net/pouloghost/article/details/8064664

版权

做并行专栏收录该内容

27 篇文章 1 订阅

订阅专栏

在实际使用中，更新1虽然看上去将计算和通信overlap了，但是，由于每次socket传输都会导致进入系统态，各种中断，更新cache，反而会很慢。而且，系统会对socket数量有限制。在我的实验环境中，以太网只允许开1k个socket，即单节点只能有1k个isend，这样严重妨碍了系统规模的扩大。所以，统一通信能获得更好的时间效果。更新通信方式。

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/stat.h>
#include <memory.h>


typedef struct {
	double real;
	double img;
} com;


double PI;


int readBinary(char* file,void *buf,int fsize);//if named read causes override and miscall
int writeBinary(char *file,com *array,int size);
//don't use the omega array,not every node needs a whole copy of omega,not efficient
static inline void cp(com f,com *t);//copy complex
static inline void add(com a,com b,com* r);
static inline void mul(com a,com b,com* r);
static inline void sub(com a,com b,com* r);
int br(int src,int size);//bit reverse
void send(com *c,com *r,int s,int t);
void show(com a) {
	printf("%.4f %.4f \n",a.real,a.img);
}
int main(int argc,char *argv[]) {
	if(argc<3) {
		printf("wtf\n");
		return 1;
	}
	double st,et;
	PI=atan(1)*4;
	int self,size;//process id and total number of processes
	MPI_Init(&argc,&argv);
	st=MPI_Wtime();
	MPI_Comm_rank(MPI_COMM_WORLD,&self);
	MPI_Comm_size(MPI_COMM_WORLD,&size);
	int fsize,n;
	void *buf;
	com *in;
	if(0==self) {
		//printf("start \n");
		struct stat fstat;
		stat(argv[1],&fstat);
		fsize=fstat.st_size;
		buf=malloc(fsize);
		n=readBinary(argv[1],buf,fsize)/2;//n stands for total complex number
	}
	MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD);//every thread should know the total size
	
	if(-1==n) {
		printf("error reading \n");
		MPI_Finalize();
	}
	in=(com*)malloc(n*sizeof(com));
	if(0==self) {
		memcpy(in,((int*)buf+1),n*sizeof(com));
		free(buf);
	}
	int psize=n/size;
	//data
	com w,m;//omega
	com t[psize];


	int l=log(n)/log(2);
	
	int off=self*psize;
	//initialize data each round
	MPI_Bcast(in,n*2,MPI_DOUBLE,0,MPI_COMM_WORLD);
	for(int h=l-1;h>=0;--h) {
		//calculate
		int p=pow(2,h);
		int q=n/p;
		int k;
		for(k=off;k<off+psize;++k) {
			if(k%p==k%(2*p)) {
				int time=p*(br(k,l)%q);
				w.real=cos(2*PI*time/n);
				w.img=sin(2*PI*time/n);
				mul(in[k+p],w,&m);
				add(in[k],m,&t[k-off]);
			} else {
				int time=p*(br(k-p,l)%q);
				w.real=cos(2*PI*time/n);
				w.img=sin(2*PI*time/n);
				mul(in[k],w,&m);
				sub(in[k-p],m,&t[k-off]);
			}
		}
		MPI_Allgather(t,2*psize,MPI_DOUBLE,in,2*psize,MPI_DOUBLE,MPI_COMM_WORLD);
	}
	//MPI_Bcast(in,n*2,MPI_DOUBLE,0,MPI_COMM_WORLD);
	//reverse all data
	int rs=0;
	for(int k=off;k<off+psize;++k) {//post all comunications first
		//tag is always the sending row number
		t[k-off]=in[br(k,l)];
	}
	MPI_Gather(t,psize*2,MPI_DOUBLE,in,psize*2,MPI_DOUBLE,0,MPI_COMM_WORLD);
	if(0==self) {
/*		for(int i=0;i<n;++i) {
			printf("b%d :%.4f %.4f \n",i,in[i].real,in[i].img);
		}*/
		writeBinary(argv[2],in,n);
	}
	free(in);
	et=MPI_Wtime();
	MPI_Finalize();
	printf("%f \n",et-st);
	return 0;
}


int readBinary(char* file,void *buf,int fsize) {
	FILE *in;
	if(!(in=fopen(file,"r"))) {
		printf("can't open \n");
		return -1;
	}


	fread(buf,sizeof(char),fsize,in);
	int size=((int*)buf)[0];
	fclose(in);
	return size;
}


int writeBinary(char *file,com *array,int size) {
	FILE *out;
	if(!(out=fopen(file,"w"))) {
		printf("can't open \n");
		return -1;
	}
	int bsize=sizeof(int)+size*sizeof(com);
	void *buf=malloc(bsize);
	((int*)buf)[0]=2*size;
	memcpy(((int*)buf+1),array,size*sizeof(com));
	fwrite(buf,sizeof(char),bsize,out);
	free(buf);
	fclose(out);
	return 0;
}
void cp(com f,com *t) {
	t->real=f.real;
	t->img=f.img;
}
void add(com a,com b,com *c)   
{
	c->real=a.real+b.real;   
	c->img=a.img+b.img;   
}   
    
void mul(com a,com b,com *c)   
{   
	c->real=a.real*b.real-a.img*b.img;   
	c->img=a.real*b.img+a.img*b.real;   
}
void sub(com a,com b,com *c)   
{   
	c->real=a.real-b.real;   
	c->img=a.img-b.img;   
}   
int br(int src,int size) {
	int tmp=src;
	int des=0;
	for(int i=size-1;i>=0;--i) {
		des=((tmp&1)<<i)|des;
		tmp=tmp>>1;
	}
	return des;
}

特别是在infiniband下，这个算法要快上十几倍。

为啥强制昵称

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
MPI实现fft的迭代算法源于并行计算——结构。算法。编程中伪码更新2

在实际使用中，更新1虽然看上去将计算和通信overlap了，但是，由于每次socket传输都会导致进入系统态，各种中断，更新cache，反而会很慢。而且，系统会对socket数量有限制。在我的实验环境中，以太网只允许开1k个socket，即单节点只能有1k个isend，这样严重妨碍了系统规模的扩大。所以，统一通信能获得更好的时间效果。更新通信方式。#include "mpi.h"#incl
复制链接

扫一扫