基于MPI和CUDA的三维各向同性二阶声波方程有限差分地震正演模拟（差分部分来自Madagascar）

最新推荐文章于 2021-11-11 01:10:30 发布

rtoax

最新推荐文章于 2021-11-11 01:10:30 发布

阅读量2.3k

点赞数 8

分类专栏：【语言与编译】【高性能计算】

本文链接：https://blog.csdn.net/Rong_Toa/article/details/77896552

版权

【语言与编译】同时被 2 个专栏收录

456 篇文章 19 订阅

订阅专栏

【高性能计算】

48 篇文章 11 订阅

订阅专栏

/*
This is how to configure lines :

#mpicc -o a test.c -lcudart -L/usr/local/cuda-7.5/lib64 -I/usr/local/cuda-7.5/include

nvcc -o a Toa_gpu_3dFD.cu -I/home/leonvel/software/mpi/mpich/include -L/home/leonvel/software/mpi/mpich/lib -lmpi
                    #-I/usr/local/openmpi/include -L/usr/local/openmpi/lib -L/usr/local/openmpi/lib/openmpi -lmpi \
                   #  -lcudart -L/usr/local/cuda-7.5/lib64 -I/usr/local/cuda-7.5/include


*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>

#include <mpi.h>

#define PI 	3.141592653

#define BlockSize1 16// tile size in 1st-axis
#define BlockSize2 16// tile size in 2nd-axis
#define mm      4    // half of the order in space
#define npd     50   // absorbing boundry condition wield

void check_gpu_error (const char *msg) 
/*< check GPU errors >*/
{
    cudaError_t err = cudaGetLastError ();
    if (cudaSuccess != err) { 
	printf("Cuda error: %s: %s\n", msg, cudaGetErrorString (err)); 
	exit(0);   
    }
}

__constant__ float stencil[mm+1]={-205.0/72.0,8.0/5.0,-1.0/5.0,8.0/315.0,-1.0/560.0};

__global__ void cuda_ricker_wavelet(float *wlt, float fm, float dt, int nt)
/*< generate ricker wavelet with time deley >*/
{
	int it=threadIdx.x+blockDim.x*blockIdx.x;
    	if (it<nt){
	  float tmp = PI*fm*fabsf(it*dt-1.0/fm);//delay the wavelet to exhibit all waveform
	  tmp *=tmp;
	  wlt[it]= (1.0-2.0*tmp)*expf(-tmp);// ricker wavelet at time: t=nt*dt
	}
}

__global__ void cuda_set_s(int *szxy, int szbeg, int sxbeg, int sybeg, int jsz, int jsx, int jsy, int ns, int nz, int nx, int ny)
/*< set the positions of sources  in whole domain >*/
{
	int id=threadIdx.x+blockDim.x*blockIdx.x;
	int nnz=nz+2*mm+2*npd;
	int nnx=nx+2*mm+2*npd;
    	if (id<ns) szxy[id]=(szbeg+id*jsz+mm+npd)+nnz*(sxbeg+id*jsx+mm+npd)+nnz*nnx*(sybeg+id*jsy+mm+npd);
}

__global__ void cuda_set_g(int *gzxy, int ng, int nz, int nx, int ny)
/*< set the positions of  geophones in whole domain >*/
{
	int id=threadIdx.x+blockDim.x*blockIdx.x;
	int nnz=nz+2*mm+2*npd;
	int nnx=nx+2*mm+2*npd;
       int iy=id/nx;
       int ix=id%nx;
    	if (id<ng) gzxy[id]=(mm+npd)+nnz*(ix*1+mm+npd)+nnz*nnx*(iy*1+mm+npd);
       
}
__global__ void cuda_trans_xy2txy(float *xy, float *txy, int it, int nt, int ng)
/*< set the positions of  geophones in whole domain >*/
{
	int id=threadIdx.x+blockDim.x*blockIdx.x;
    	if (id<ng) txy[it+id*nt]+=xy[id];
       
}

__global__ void cuda_absorb_bndr(float *d_p,int nz,int nx,int ny,float qp)
/*< absorb boundry condition >*/
{
    const int iz = blockIdx.x * blockDim.x + threadIdx.x;//0--nz's thread:iz
    const int ix = blockIdx.y * blockDim.y + threadIdx.y;//0--nx's thread:ix

       int id,iy;
	int nnz=nz+2*mm+2*npd;
	int nnx=nx+2*mm+2*npd;
	int nny=ny+2*mm+2*npd;

       for(iy=0;iy<nny;iy++)
        {
          id=iz+ix*nnz+iy*nnz*nnx;
            /*< front & back (0<y<ny) >*/
             if ( iy < npd )
               d_p[id]=( qp*pow((npd-iy)/(1.0*npd),2) + 1 )*d_p[id];
             else if ( iy >= 2*mm + npd + ny )
               d_p[id]=( qp*pow((iy-2*mm-npd-ny)/(1.0*npd),2) + 1 )*d_p[id];
            /*< left & right (0<x<nx) >*/
             if ( ix < npd )
               d_p[id]=( qp*pow((npd-ix)/(1.0*npd),2) + 1 )*d_p[id];
             else if ( ix >= 2*mm + npd + nx )
               d_p[id]=( qp*pow((ix-2*mm-npd-nx)/(1.0*npd),2) + 1 )*d_p[id];
            /*< up & down (0<z<nz) >*/
             if ( iz < npd )
               d_p[id]=( qp*pow((npd-iz)/(1.0*npd),2) + 1 )*d_p[id];
             else if ( iz >= 2*mm + npd + nz )
               d_p[id]=( qp*pow((iz-2*mm-npd-nz)/(1.0*npd),2) + 1 )*d_p[id];
        }
       
}
__global__ void cuda_record(float *p, float *seis, int *gxz, int ng)//++++++++++++
/*< record the seismogram at time it >*/
{
	int id=threadIdx.x+blockDim.x*blockIdx.x;
    	if (id<ng) seis[id]=p[gxz[id]];
}


__global__ void cuda_add_source(bool add, float *p, float *source, int *szxy, int ns)
/*< add/subtract sources: length of source[]=ns, index stored in szxy[] >*/
{
  int id=threadIdx.x+blockIdx.x*blockDim.x;

  if(id<ns){
    if(add){
      p[szxy[id]]+=source[id];
    }else{
      p[szxy[id]]-=source[id];
    }
  }
}

__global__ void cuda_step_fd3d(float *p0, float *p1, float *vv, float _dz2, float _dx2, float _dy2, int n1, int n2, int n3)
/*< step forward: 3-D FD, order=8 >*/
{
    bool validr = true;
    bool validw = true;
    const int gtid1 = blockIdx.x * blockDim.x + threadIdx.x;//0--nz's thread:iz
    const int gtid2 = blockIdx.y * blockDim.y + threadIdx.y;//0--nx's thread:ix
    const int ltid1 = threadIdx.x;//ithreadz
    const int ltid2 = threadIdx.y;//ithreadx
    const int work1 = blockDim.x;//nblockz
    const int work2 = blockDim.y;//nblockx
    __shared__ float tile[BlockSize2 + 2 * mm][BlockSize1 + 2 * mm];//tile[16+2*mm][16+2*mm]

    const int stride2 = n1 + 2 * mm + 2 * npd;//n1=nz
    const int stride3 = stride2 * (n2 + 2 * mm + 2 * npd);//n2=nx   stride3=(nz+2*mm)*(nx+2*mm)

    int inIndex = 0;
    int outIndex = 0;

    // Advance inputIndex to start of inner volume
    inIndex += (mm ) * stride2 + mm ;// inIndex=mm*(nz+2*mm+2*npd)+mm;

    // Advance inputIndex to target element
    inIndex += gtid2 * stride2 + gtid1; // inIndex=mm*(nz+2*mm)+mm+ix*(nz+2*mm+2*npd)+iz;:igrid

    float infront[mm];
    float behind[mm];
    float current;

    const int t1 = ltid1 + mm;
    const int t2 = ltid2 + mm;

    // Check in bounds
    if ((gtid1 >= n1 + mm + 2*npd) ||(gtid2 >= n2 + mm + 2*npd)) validr = false;
    if ((gtid1 >= n1 + 2*npd) ||(gtid2 >= n2 + 2*npd)) validw = false;

    // Preload the "infront" and "behind" data
    for (int i = mm -2 ; i >= 0 ; i--)//change 'mm-2' to 'mm-1'+++++++++++++++++++
    {
        if (validr) behind[i] = p1[inIndex];
        inIndex += stride3;//stride3=(nz+2*mm)*(nx+2*mm)
    }

    if (validr)	current = p1[inIndex];

    outIndex = inIndex;
    inIndex += stride3;//stride3=(nz+2*mm)*(nx+2*mm)

    for (int i = 0 ; i < mm ; i++)
    {
	if (validr) infront[i] = p1[inIndex];
        inIndex += stride3;//stride3=(nz+2*mm)*(nx+2*mm)
    }

    // Step through the zx-planes

    for (int i3 = mm ; i3 < n3 + 2*npd + mm ; i3++)
    {
        // Advance the slice (move the thread-front)
        for (int i = mm - 1 ; i > 0 ; i--) behind[i] = behind[i - 1];

        behind[0] = current;
        current = infront[0];

        for (int i = 0 ; i < mm - 1 ; i++) infront[i] = infront[i + 1];

        if (validr) infront[mm - 1] = p1[inIndex];

        inIndex += stride3;
        outIndex += stride3;
        __syncthreads();

        // Update the data slice in the local tile
        // Halo above & below
        if (ltid2 < mm)
        {
          /*   tile[ithread][ithread+mm]=p1[igrid - mm*(nz+2*mm)]  */
            tile[ltid2][t1]                  = p1[outIndex - mm * stride2];//t1 = ltid1 + mm;
            tile[ltid2 + work2 + mm][t1] = p1[outIndex + work2 * stride2];
        }

        // Halo left & right
        if (ltid1 < mm)
        {
            tile[t2][ltid1]                  = p1[outIndex - mm];
            tile[t2][ltid1 + work1 + mm] = p1[outIndex + work1];
        }

        tile[t2][t1] = current;
        __syncthreads();

        // Compute the output value
		float c1, c2, c3;
			c1=c2=c3=stencil[0]*current;        

        for (int i=1; i <= mm ; i++)
        {
			c1 +=stencil[i]*(tile[t2][t1-i]+ tile[t2][t1+i]);//z
			c2 +=stencil[i]*(tile[t2-i][t1]+ tile[t2+i][t1]);//x
			c3 +=stencil[i]*(infront[i-1]  + behind[i-1]  ); //y
        }
			c1*=_dz2;	
			c2*=_dx2;
			c3*=_dy2;
	
	
        if (validw) p0[outIndex]=2.0*p1[outIndex]-p0[outIndex]+vv[outIndex]*(c1+c2+c3);
    }

}

void velocity_transform(float *v0, float*vv, float dt, int n1, int n2, int n3)
 /*< velocit2 transform: vv=v0*dt; vv<--vv^2 >*/
{
  int i1, i2, i3, nn1, nn2, nn3;
  float tmp;

  nn1=n1+2*mm+2*npd;
  nn2=n2+2*mm+2*npd;
  nn3=n3+2*mm+2*npd;

  // inner zone
  for(i3=0; i3<n3; i3++){//y
    for(i2=0; i2<n2; i2++){//x
      for(i1=0; i1<n1; i1++){//z
	tmp=v0[i1+n1*i2+n1*n2*i3]*dt;
	vv[(i1+mm+npd)+nn1*(i2+mm+npd)+nn1*nn2*(i3+mm+npd)]=tmp*tmp;
      }
    }
  }  
    //top & down 
    for(i3=0; i3<nn3; i3++){//y
	for(i2=0; i2<nn2; i2++){//x
	    for (i1=0; i1<mm+npd; i1++){//z
		vv[i1+nn1*i2+nn1*nn2*i3]=vv[mm+npd+nn1*i2+nn1*nn2*i3];
		vv[(nn1-i1-1)+nn1*i2+nn1*nn2*i3]=vv[(nn1-mm-npd-1)+nn1*i2+nn1*nn2*i3];
	    }
	}
    }

    //left & right
    for(i3=0; i3<nn3; i3++){//y
	for(i2=0; i2<mm+npd; i2++){//x
	    for (i1=0; i1<nn1; i1++){//z
		vv[i1+nn1*i2+nn1*nn2*i3]=vv[i1+nn1*(mm+npd)+nn1*nn2*i3];
		vv[i1+nn1*(nn2-i2-1)+nn1*nn2*i3]=vv[i1+nn1*(nn2-mm-npd-1)+nn1*nn2*i3];
	    }
	}
    }
    //front & back
    for(i3=0; i3<mm+npd; i3++){//y
	for(i2=0; i2<nn2; i2++){//x
	    for(i1=0; i1<nn1; i1++){//z
		vv[i1+nn1*i2+nn1*nn2*i3]=vv[i1+nn1*i2+nn1*nn2*(mm+npd)];
		vv[i1+nn1*i2+nn1*nn2*(nn3-1-i3)]=vv[i1+nn1*i2+nn1*nn2*(nn3-mm-npd-1)];
	    }
	}
    }
}


void window3d(float *a, float *b, int n1, int n2, int n3)
/*< window a 3d subvolume >*/
{
	int i1, i2, i3, nn1, nn2;
	nn1=n1+2*mm+ 2*npd;//z
	nn2=n2+2*mm+ 2*npd;//x
	
	for(i3=0; i3<n3; i3++)
	for(i2=0; i2<n2; i2++)
	for(i1=0; i1<n1; i1++)
	{
		a[i1+n1*i2+n1*n2*i3]=b[(i1+mm+npd)+nn1*(i2+mm+npd)+nn1*nn2*(i3+mm+npd)];
	}
}


int main(int argc, char* argv[])
{

	int rank, size, nz, nx, ny, nnz, nnx, nny, ns, nt, kt, it, is, szbeg, sxbeg, sybeg, jsz, jsx, jsy, ng;
	int *d_szxy,*d_gzxy;
	float dz, dx, dy, fm, dt, _dz2, _dx2, _dy2;
	float *v0, *vv, *d_wlt, *d_vv, *d_p0, *d_p1, *ptr;
       float *d_dcal,*d_dcal0,*d_dcal1;

	char FNvel[250]={"vel201202203.dat"};
       char FNsnap[250]={"snap201202203.dat"};
       char FNshot[250]={"shot.dat"};

       FILE *fpvel,*fpsnap,*fpshot;
       fpvel=fopen(FNvel,"rb");
       fpsnap=fopen(FNsnap,"wb");
       fpshot=fopen(FNshot,"wb");
       

    	nz=201;
    	nx=202;
    	ny=203;
    	dz=10;
    	dx=10;
    	dy=10;
   	nt=1001;
	/* total number of time steps */
    	kt=600;
	/* record wavefield at time kt */
   	dt=0.001;
	/* time sampling interval */
   	fm=20;
	/* dominant frequency of Ricker wavelet */
   	ns=2;
	/* number of sources */
	szbeg=1;
	/* source beginning of z-axis */
	sxbeg=100;
	/* source beginning of x-axis */
	sybeg=100;
	/* source beginning of y-axis */
	jsz=0;
	/* source jump interval in z-axis */
	jsx=50;
	/* source jump interval in x-axis */
	jsy=50;
	/* source jump interval in y-axis */





	_dz2=1.0/(dz*dz);
	_dx2=1.0/(dx*dx);
	_dy2=1.0/(dy*dy);

	nnz=nz+2*mm+2*npd;
	nnx=nx+2*mm+2*npd;
	nny=ny+2*mm+2*npd;
       //ng=nx*ny;//+++++++++++++
       ng=nx*ny;


	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
       printf("rank=%d, size=%d,\n",rank,size);
       MPI_Barrier(MPI_COMM_WORLD);


    	v0=(float*)malloc(nz*nx*ny*sizeof(float));
    	vv=(float*)malloc(nnz*nnx*nny*sizeof(float));
    	d_dcal1=(float*)malloc(ng*nt*sizeof(float));//++++++++++

	fread(v0, sizeof(float), nz*nx*ny, fpvel);// read velocit2 model v0
	velocity_transform(v0, vv, dt, nz, nx, ny);// init

    	cudaSetDevice(0);// initialize device, default device=0;
	check_gpu_error("Failed to initialize device!");

	dim3 dimg, dimb;
	dimg.x=(nz+2*npd+2*mm+BlockSize1-1)/BlockSize1;//BlockSize1=16;
	dimg.y=(nx+2*npd+2*mm+BlockSize2-1)/BlockSize2;//BlockSize2=16;
	dimb.x=BlockSize1;
	dimb.y=BlockSize2;

	/* allocate memory on device */
	cudaMalloc(&d_wlt, nt*sizeof(float));
	cudaMalloc(&d_vv, nnz*nnx*nny*sizeof(float));
	cudaMalloc(&d_p0, nnz*nnx*nny*sizeof(float));
	cudaMalloc(&d_p1, nnz*nnx*nny*sizeof(float));
	cudaMalloc(&d_szxy, ns*sizeof(int));
	cudaMalloc(&d_gzxy, ng*sizeof(int));//++++++++++
	cudaMalloc(&d_dcal, ng*sizeof(float));	/* calculated/synthetic seismic data *///++++++++++++++
	cudaMalloc(&d_dcal0, ng*nt*sizeof(float)); //++++++++++++++


	check_gpu_error("Failed to allocate memory for variables!");

	cuda_ricker_wavelet<<<(nt+511)/512, 512>>>(d_wlt, fm, dt, nt);
	cudaMemcpy(d_vv, vv, nnz*nnx*nny*sizeof(float), cudaMemcpyHostToDevice);
	cuda_set_s<<<1, ns>>>(d_szxy, szbeg, sxbeg, sybeg, jsz, jsx, jsy, ns, nz, nx, ny);
	cuda_set_g<<<(ng+511)/512,512>>>(d_gzxy, ng, nz, nx, ny);//++++++++++++geophone

	float mstimer;
	clock_t t0, t1;
	cudaEvent_t start, stop;
  	cudaEventCreate(&start);	
	cudaEventCreate(&stop);

       MPI_Barrier(MPI_COMM_WORLD);
	for(is=rank; is<ns; is+=size){
	  cudaEventRecord(start);

	  cudaMemset(d_p0, 0, nnz*nnx*nny*sizeof(float));
	  cudaMemset(d_p1, 0, nnz*nnx*nny*sizeof(float));
	  cudaMemset(d_dcal, 0, ng*sizeof(float));//++++++++++
	  cudaMemset(d_dcal0, 0, ng*nt*sizeof(float));//++++++++++



	  for(it=0; it<nt; it++){
	    cuda_add_source<<<1,1>>>(true, d_p1, &d_wlt[it], &d_szxy[is], 1);
	    cuda_step_fd3d<<<dimg,dimb>>>(d_p0, d_p1, d_vv, _dz2, _dx2, _dy2, nz, nx, ny);//<<<[(nz+16-1)/16,(nx+16-1)/16],[16,16]>>>
	    ptr=d_p0; d_p0=d_p1; d_p1=ptr;//toggle buffers

           cuda_absorb_bndr<<<dimg,dimb>>>(d_p0, nz, nx, ny, -0.25);
           cuda_absorb_bndr<<<dimg,dimb>>>(d_p1, nz, nx, ny, -0.25);

	    cuda_record<<<(ng+511)/512, 512>>>(d_p0, d_dcal, d_gzxy, ng);//++++++++++
           cuda_trans_xy2txy<<<(ng+511)/512, 512>>>(d_dcal, d_dcal0, it, nt, ng);



/*	    if(it==kt){
	      t0 = clock();

	      cudaMemcpy(vv, d_p0, nnz*nnx*nny*sizeof(float), cudaMemcpyDeviceToHost);
	      window3d(v0, vv, nz, nx, ny);
	      fwrite(v0, sizeof(float),nz*nx*ny, fpsnap);	  

 	      t1 = clock();
 	      printf("save the volume: %f (s)\n", ((float)(t1-t0))/CLOCKS_PER_SEC); 	
	    }*/

          if(it%100==0)
	     printf("is=%2d, it=%d\n",is,it);
	  }
           MPI_Barrier(MPI_COMM_WORLD);
	    cudaMemcpy(d_dcal1, d_dcal0, ng*nt*sizeof(float), cudaMemcpyDeviceToHost);
           fseek(fpshot,is*ng*nt*sizeof(float),0);
	    fwrite(d_dcal1, sizeof(float), ng*nt, fpshot);

	  cudaEventRecord(stop);
          cudaEventSynchronize(stop);
  	  cudaEventElapsedTime(&mstimer, start, stop);
    	  printf("%d shot finished: %g (s)\n",is+1, mstimer*1.e-3);
	}
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	/* free memory on device */
	cudaFree(d_wlt);
	cudaFree(d_vv);
	cudaFree(d_p0);
	cudaFree(d_p1);
	cudaFree(d_szxy);
	cudaFree(d_gzxy);
	cudaFree(d_dcal);//+++++++++++
	cudaFree(d_dcal0);//+++++++++++
	free(v0);
	free(vv);
	free(d_dcal1);

/******************MPI************************/
   MPI_Finalize();

    	exit (0);
}

当然，我们可以将其分为两个文件，然后分别编译，内容如下（三个文件，一个mpi文件，一个cuda文件，一个Makefile文件）：

CUDA_INSTALL_PATH = /usr/local/cuda-7.5
MPI_INSTALL_PATH = /home/leonvel/software/mpi/mpich

NVCC = $(CUDA_INSTALL_PATH)/bin/nvcc
MPICC = $(MPI_INSTALL_PATH)/bin/mpicc

LDFLAGS = -L$(CUDA_INSTALL_PATH)/lib64
LIB = -lcudart -lcurand

CFILES = Toa_3dfd_mpi.c
CUFILES = Toa_3dfd_cuda.cu
OBJECTS = Toa_3dfd_mpi.o Toa_3dfd_cuda.o 
EXECNAME = fd

#CFILES = test.c
#CUFILES = test_cuda.cu
#OBJECTS = test.o test_cuda.o 
#EXECNAME = test

all:
	$(MPICC) -c $(CFILES)
	$(NVCC) -c $(CUFILES)
	$(MPICC) -o $(EXECNAME) $(LDFLAGS) $(LIB) $(OBJECTS)

	rm -f *.o *~

MPI文件：

//a#########################################################
//a##    3D iso acoustic fd :MPI + CUDA 
//a##                       code by Rong Tao
//a#########################################################
#include<stdio.h>
#include<malloc.h>
#include<math.h>
#include<stdlib.h>
#include "mpi.h"

void cuda_3dfd(FILE *fpvel, FILE *fpsnap, FILE *fpshot, int is, int ns, int myid, int nx, int ny, int nz, 
              float dx, float dy, float dz, int sxbeg, int sybeg, int szbeg, int jsx, int jsy, int jsz, 
              int nt, int kt, float dt, float fm);

main(int argc,char *argv[])
{

	int myid, numprocs;
	int is, ns, nx, ny, nz, szbeg, sxbeg, sybeg, jsz, jsx, jsy, nt, kt;
       float dx, dy, dz, dt, fm;

       clock_t t0, t1;


	char FNvel[250]={"vel101102103.dat"};
       char FNsnap[250]={"snap.dat"};
       char FNshot[250]={"shot.dat"};

       FILE *fpvel,*fpsnap,*fpshot;
       fpsnap=fopen(FNsnap,"wb");
       fpshot=fopen(FNshot,"wb+");


    	nx=101;   dx=5;   sxbeg=10;   jsx=9;
    	ny=102;   dy=5;   sybeg=10;   jsy=9;
    	nz=103;   dz=5;   szbeg=1;    jsz=0;
    	
       nt=401;kt=300;dt=0.0005;

   	fm=40;	

   	ns=4;


/*******************************************/
      MPI_Init(&argc,&argv);
      MPI_Comm_rank(MPI_COMM_WORLD,&myid);
      MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
      MPI_Barrier(MPI_COMM_WORLD);

     // numprocs=4;

      if(myid==0)t0 = clock();
   
      for(is=myid;is<ns;is=is+numprocs)
      {
       fpvel=fopen(FNvel,"rb");

       if(myid==0){printf("######################\n");printf("MPIforward: is==%d\n",is);}
       cuda_3dfd(fpvel, fpsnap, fpshot, is, ns, myid, nx, ny, nz, dx, dy, dz, 
                         sxbeg, sybeg, szbeg, jsx, jsy, jsz, nt, kt, dt, fm);
       fclose(fpvel);
      }

     if(myid==0)t1 = clock();
     if(myid==0)printf("####### MPI_Totally: %f (s)\n", ((float)(t1-t0)/1000000.0)); 

     MPI_Barrier(MPI_COMM_WORLD);


     fclose(fpsnap);
     fclose(fpshot);


     MPI_Finalize();
}

cuda文件：

//a#########################################################
//a##    3D iso acoustic fd :MPI + CUDA 
//a##                       code by Rong Tao
//a#########################################################
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cuda.h>

__device__ volatile int vint = 0;

#define PI 	3.141592653

#define BlockSize1 16// tile size in 1st-axis
#define BlockSize2 16// tile size in 2nd-axis
#define mm      4    // half of the order in space
#define npd     50   // absorbing boundry condition wield


void check_gpu_error (const char *msg) 
/*< check GPU errors >*/
{
    cudaError_t err = cudaGetLastError ();
    if (cudaSuccess != err) { 
	printf("Cuda error: %s: %s\n", msg, cudaGetErrorString (err)); 
	exit(0);   
    }
}

__constant__ float stencil[mm+1]={-205.0/72.0,8.0/5.0,-1.0/5.0,8.0/315.0,-1.0/560.0};

__global__ void cuda_ricker_wavelet(float *wlt, float fm, float dt, int nt)
/*< generate ricker wavelet with time deley >*/
{
	int it=threadIdx.x+blockDim.x*blockIdx.x;
    	if (it<nt){
	  float tmp = PI*fm*fabsf(it*dt-1.0/fm);//delay the wavelet to exhibit all waveform
	  tmp *=tmp;
	  wlt[it]= (1.0-2.0*tmp)*expf(-tmp);// ricker wavelet at time: t=nt*dt
	}
}

__global__ void cuda_set_s(int *szxy, int szbeg, int sxbeg, int sybeg, int jsz, int jsx, int jsy, int ns, int nz, int nx, int ny)
/*< set the positions of sources  in whole domain >*/
{
	int id=threadIdx.x+blockDim.x*blockIdx.x;
	int nnz=nz+2*mm+2*npd;
	int nnx=nx+2*mm+2*npd;
    	if (id<ns) szxy[id]=(szbeg+id*jsz+mm+npd)+nnz*(sxbeg+id*jsx+mm+npd)+nnz*nnx*(sybeg+id*jsy+mm+npd);
}

__global__ void cuda_set_g(int *gzxy, int ng, int nz, int nx, int ny)
/*< set the positions of  geophones in whole domain >*/
{
	int id=threadIdx.x+blockDim.x*blockIdx.x;
	int nnz=nz+2*mm+2*npd;
	int nnx=nx+2*mm+2*npd;
       int iy=id/nx;
       int ix=id%nx;
    	if (id<ng) gzxy[id]=(mm+npd)+nnz*(ix*1+mm+npd)+nnz*nnx*(iy*1+mm+npd);
       
}
__global__ void cuda_trans_xy2txy(float *xy, float *txy, int it, int nt, int ng)
/*< set the positions of  geophones in whole domain >*/
{
	int id=threadIdx.x+blockDim.x*blockIdx.x;
    	if (id<ng) txy[it+id*nt]+=xy[id];
       
}

__global__ void cuda_absorb_bndr(float *d_p,int nz,int nx,int ny,float qp)
/*< absorb boundry condition >*/
{
    const int iz = blockIdx.x * blockDim.x + threadIdx.x;//0--nz's thread:iz
    const int ix = blockIdx.y * blockDim.y + threadIdx.y;//0--nx's thread:ix

       int id,iy;
	int nnz=nz+2*mm+2*npd;
	int nnx=nx+2*mm+2*npd;
	int nny=ny+2*mm+2*npd;

       for(iy=0;iy<nny;iy++)
        {
          id=iz+ix*nnz+iy*nnz*nnx;
            /*< front & back (0<y<ny) >*/
             if ( iy < npd )
               d_p[id]=( qp*pow((npd-iy)/(1.0*npd),2) + 1 )*d_p[id];
             else if ( iy >= 2*mm + npd + ny )
               d_p[id]=( qp*pow((iy-2*mm-npd-ny)/(1.0*npd),2) + 1 )*d_p[id];
            /*< left & right (0<x<nx) >*/
             if ( ix < npd )
               d_p[id]=( qp*pow((npd-ix)/(1.0*npd),2) + 1 )*d_p[id];
             else if ( ix >= 2*mm + npd + nx )
               d_p[id]=( qp*pow((ix-2*mm-npd-nx)/(1.0*npd),2) + 1 )*d_p[id];
            /*< up & down (0<z<nz) >*/
             if ( iz < npd )
               d_p[id]=( qp*pow((npd-iz)/(1.0*npd),2) + 1 )*d_p[id];
             else if ( iz >= 2*mm + npd + nz )
               d_p[id]=( qp*pow((iz-2*mm-npd-nz)/(1.0*npd),2) + 1 )*d_p[id];
        }
       
}
__global__ void cuda_record(float *p, float *seis, int *gxz, int ng)//++++++++++++
/*< record the seismogram at time it >*/
{
	int id=threadIdx.x+blockDim.x*blockIdx.x;
    	if (id<ng) seis[id]=p[gxz[id]];
}


__global__ void cuda_add_source(bool add, float *p, float *source, int *szxy, int ns)
/*< add/subtract sources: length of source[]=ns, index stored in szxy[] >*/
{
  int id=threadIdx.x+blockIdx.x*blockDim.x;

  if(id<ns){
    if(add){
      p[szxy[id]]+=source[id];
    }else{
      p[szxy[id]]-=source[id];
    }
  }
}

__global__ void cuda_step_fd3d(float *p0, float *p1, float *vv, float _dz2, float _dx2, float _dy2, int n1, int n2, int n3)
/*< step forward: 3-D FD, order=8 >*/
{
    bool validr = true;
    bool validw = true;
    const int gtid1 = blockIdx.x * blockDim.x + threadIdx.x;//0--nz's thread:iz
    const int gtid2 = blockIdx.y * blockDim.y + threadIdx.y;//0--nx's thread:ix
    const int ltid1 = threadIdx.x;//ithreadz
    const int ltid2 = threadIdx.y;//ithreadx
    const int work1 = blockDim.x;//nblockz
    const int work2 = blockDim.y;//nblockx
    __shared__ float tile[BlockSize2 + 2 * mm][BlockSize1 + 2 * mm];//tile[16+2*mm][16+2*mm]

    const int stride2 = n1 + 2 * mm + 2 * npd;//n1=nz
    const int stride3 = stride2 * (n2 + 2 * mm + 2 * npd);//n2=nx   stride3=(nz+2*mm)*(nx+2*mm)

    int inIndex = 0;
    int outIndex = 0;

    // Advance inputIndex to start of inner volume
    inIndex += (mm ) * stride2 + mm ;// inIndex=mm*(nz+2*mm+2*npd)+mm;

    // Advance inputIndex to target element
    inIndex += gtid2 * stride2 + gtid1; // inIndex=mm*(nz+2*mm)+mm+ix*(nz+2*mm+2*npd)+iz;:igrid

    float infront[mm];
    float behind[mm];
    float current;

    const int t1 = ltid1 + mm;
    const int t2 = ltid2 + mm;

    // Check in bounds
    if ((gtid1 >= n1 + mm + 2*npd) ||(gtid2 >= n2 + mm + 2*npd)) validr = false;
    if ((gtid1 >= n1 + 2*npd) ||(gtid2 >= n2 + 2*npd)) validw = false;

    // Preload the "infront" and "behind" data
    for (int i = mm -2 ; i >= 0 ; i--)//change 'mm-2' to 'mm-1'+++++++++++++++++++
    {
        if (validr) behind[i] = p1[inIndex];
        inIndex += stride3;//stride3=(nz+2*mm)*(nx+2*mm)
    }

    if (validr)	current = p1[inIndex];

    outIndex = inIndex;
    inIndex += stride3;//stride3=(nz+2*mm)*(nx+2*mm)

    for (int i = 0 ; i < mm ; i++)
    {
	if (validr) infront[i] = p1[inIndex];
        inIndex += stride3;//stride3=(nz+2*mm)*(nx+2*mm)
    }

    // Step through the zx-planes

    for (int i3 = mm ; i3 < n3 + 2*npd + mm ; i3++)
    {
        // Advance the slice (move the thread-front)
        for (int i = mm - 1 ; i > 0 ; i--) behind[i] = behind[i - 1];

        behind[0] = current;
        current = infront[0];

        for (int i = 0 ; i < mm - 1 ; i++) infront[i] = infront[i + 1];

        if (validr) infront[mm - 1] = p1[inIndex];

        inIndex += stride3;
        outIndex += stride3;
        __syncthreads();

        // Update the data slice in the local tile
        // Halo above & below
        if (ltid2 < mm)
        {
          /*   tile[ithread][ithread+mm]=p1[igrid - mm*(nz+2*mm)]  */
            tile[ltid2][t1]                  = p1[outIndex - mm * stride2];//t1 = ltid1 + mm;
            tile[ltid2 + work2 + mm][t1] = p1[outIndex + work2 * stride2];
        }

        // Halo left & right
        if (ltid1 < mm)
        {
            tile[t2][ltid1]                  = p1[outIndex - mm];
            tile[t2][ltid1 + work1 + mm] = p1[outIndex + work1];
        }

        tile[t2][t1] = current;
        __syncthreads();

        // Compute the output value
		float c1, c2, c3;
			c1=c2=c3=stencil[0]*current;        

        for (int i=1; i <= mm ; i++)
        {
			c1 +=stencil[i]*(tile[t2][t1-i]+ tile[t2][t1+i]);//z
			c2 +=stencil[i]*(tile[t2-i][t1]+ tile[t2+i][t1]);//x
			c3 +=stencil[i]*(infront[i-1]  + behind[i-1]  ); //y
        }
			c1*=_dz2;	
			c2*=_dx2;
			c3*=_dy2;
	
	
        if (validw) p0[outIndex]=2.0*p1[outIndex]-p0[outIndex]+vv[outIndex]*(c1+c2+c3);
    }

}

void velocity_transform(float *v0, float*vv, float dt, int n1, int n2, int n3)
 /*< velocit2 transform: vv=v0*dt; vv<--vv^2 >*/
{
  int i1, i2, i3, nn1, nn2, nn3;
  float tmp;

  nn1=n1+2*mm+2*npd;
  nn2=n2+2*mm+2*npd;
  nn3=n3+2*mm+2*npd;

  // inner zone
  for(i3=0; i3<n3; i3++){//y
    for(i2=0; i2<n2; i2++){//x
      for(i1=0; i1<n1; i1++){//z
	tmp=v0[i1+n1*i2+n1*n2*i3]*dt;
	vv[(i1+mm+npd)+nn1*(i2+mm+npd)+nn1*nn2*(i3+mm+npd)]=tmp*tmp;
      }
    }
  }  
    //top & down 
    for(i3=0; i3<nn3; i3++){//y
	for(i2=0; i2<nn2; i2++){//x
	    for (i1=0; i1<mm+npd; i1++){//z
		vv[i1+nn1*i2+nn1*nn2*i3]=vv[mm+npd+nn1*i2+nn1*nn2*i3];
		vv[(nn1-i1-1)+nn1*i2+nn1*nn2*i3]=vv[(nn1-mm-npd-1)+nn1*i2+nn1*nn2*i3];
	    }
	}
    }

    //left & right
    for(i3=0; i3<nn3; i3++){//y
	for(i2=0; i2<mm+npd; i2++){//x
	    for (i1=0; i1<nn1; i1++){//z
		vv[i1+nn1*i2+nn1*nn2*i3]=vv[i1+nn1*(mm+npd)+nn1*nn2*i3];
		vv[i1+nn1*(nn2-i2-1)+nn1*nn2*i3]=vv[i1+nn1*(nn2-mm-npd-1)+nn1*nn2*i3];
	    }
	}
    }
    //front & back
    for(i3=0; i3<mm+npd; i3++){//y
	for(i2=0; i2<nn2; i2++){//x
	    for(i1=0; i1<nn1; i1++){//z
		vv[i1+nn1*i2+nn1*nn2*i3]=vv[i1+nn1*i2+nn1*nn2*(mm+npd)];
		vv[i1+nn1*i2+nn1*nn2*(nn3-1-i3)]=vv[i1+nn1*i2+nn1*nn2*(nn3-mm-npd-1)];
	    }
	}
    }
}


void window3d(float *a, float *b, int n1, int n2, int n3)
/*< window a 3d subvolume >*/
{
	int i1, i2, i3, nn1, nn2;
	nn1=n1+2*mm+ 2*npd;//z
	nn2=n2+2*mm+ 2*npd;//x
	
	for(i3=0; i3<n3; i3++)
	for(i2=0; i2<n2; i2++)
	for(i1=0; i1<n1; i1++)
	{
		a[i1+n1*i2+n1*n2*i3]=b[(i1+mm+npd)+nn1*(i2+mm+npd)+nn1*nn2*(i3+mm+npd)];
	}
}


extern "C"  void cuda_3dfd(FILE *fpvel, FILE *fpsnap, FILE *fpshot, int is, int ns, int myid,  int nx, int ny, int nz, 
                          float dx, float dy, float dz, int sxbeg, int sybeg, int szbeg, int jsx, int jsy, int jsz, 
                          int nt, int kt, float dt, float fm)
{
	int  nnz, nnx, nny, it, ng;
	int *d_szxy,*d_gzxy;
	float _dz2, _dx2, _dy2;
	float *v0, *vv, *d_wlt, *d_vv, *d_p0, *d_p1, *ptr;
       float *d_dcal_device_xy,*d_dcal_device_txy,*d_dcal_host;

	clock_t t0, t1;
	t0 = clock();

	_dz2=1.0/(dz*dz);
	_dx2=1.0/(dx*dx);
	_dy2=1.0/(dy*dy);

	nnz=nz+2*mm+2*npd;
	nnx=nx+2*mm+2*npd;
	nny=ny+2*mm+2*npd;

       ng=nx*ny;

    	v0=(float*)malloc(nz*nx*ny*sizeof(float));
    	vv=(float*)malloc(nnz*nnx*nny*sizeof(float));
    	d_dcal_host=(float*)malloc(ng*nt*sizeof(float));

	fread(v0, sizeof(float), nz*nx*ny, fpvel);
	velocity_transform(v0, vv, dt, nz, nx, ny);

    	cudaSetDevice(0);
	check_gpu_error("CUDA:Failed to initialize device!");

	dim3 dimg, dimb;
	dimg.x=(nz+2*npd+2*mm+BlockSize1-1)/BlockSize1;
	dimg.y=(nx+2*npd+2*mm+BlockSize2-1)/BlockSize2;
	dimb.x=BlockSize1;
	dimb.y=BlockSize2;

	/* allocate memory on device */
	cudaMalloc(&d_wlt, nt*sizeof(float));
	cudaMalloc(&d_vv, nnz*nnx*nny*sizeof(float));
	cudaMalloc(&d_p0, nnz*nnx*nny*sizeof(float));
	cudaMalloc(&d_p1, nnz*nnx*nny*sizeof(float));
	cudaMalloc(&d_szxy, ns*sizeof(int));
	cudaMalloc(&d_gzxy, ng*sizeof(int));
	cudaMalloc(&d_dcal_device_xy, ng*sizeof(float));	
	cudaMalloc(&d_dcal_device_txy, ng*nt*sizeof(float)); 

	check_gpu_error("CUDA: Failed to allocate memory for variables!");

	cuda_ricker_wavelet<<<(nt+511)/512, 512>>>(d_wlt, fm, dt, nt);
	cudaMemcpy(d_vv, vv, nnz*nnx*nny*sizeof(float), cudaMemcpyHostToDevice);
	cuda_set_s<<<1, ns>>>(d_szxy, szbeg, sxbeg, sybeg, jsz, jsx, jsy, ns, nz, nx, ny);
	cuda_set_g<<<(ng+511)/512,512>>>(d_gzxy, ng, nz, nx, ny);

	cudaMemset(d_p0, 0, nnz*nnx*nny*sizeof(float));
	cudaMemset(d_p1, 0, nnz*nnx*nny*sizeof(float));
	cudaMemset(d_dcal_device_xy, 0, ng*sizeof(float));
	cudaMemset(d_dcal_device_txy, 0, ng*nt*sizeof(float));


       if(myid==0)printf("   cuda: is=%2d << it= 0",is);
	for(it=0; it<nt; it++)
        {
	  cuda_add_source<<<1,1>>>(true, d_p1, &d_wlt[it], &d_szxy[is], 1);
	  cuda_step_fd3d<<<dimg,dimb>>>(d_p0, d_p1, d_vv, _dz2, _dx2, _dy2, nz, nx, ny);
	  ptr=d_p0; d_p0=d_p1; d_p1=ptr;

         cuda_absorb_bndr<<<dimg,dimb>>>(d_p0, nz, nx, ny, -0.25);
         cuda_absorb_bndr<<<dimg,dimb>>>(d_p1, nz, nx, ny, -0.25);

	  cuda_record<<<(ng+511)/512, 512>>>(d_p0, d_dcal_device_xy, d_gzxy, ng);
         cuda_trans_xy2txy<<<(ng+511)/512, 512>>>(d_dcal_device_xy, d_dcal_device_txy, it, nt, ng);

           /* if(it==kt&&is==0){
	      cudaMemcpy(vv, d_p0, nnz*nnx*nny*sizeof(float), cudaMemcpyDeviceToHost);
	      window3d(v0, vv, nz, nx, ny);
	      fwrite(v0, sizeof(float),nz*nx*ny, fpsnap);	  
	    }*/
          if(it%200==0&&myid==0) printf("-%d",it);
	  }

	cudaMemcpy(d_dcal_host, d_dcal_device_txy, ng*nt*sizeof(float), cudaMemcpyDeviceToHost);
       fseek(fpshot,is*ng*nt*sizeof(float),0);
	fwrite(d_dcal_host, sizeof(float), ng*nt, fpshot);

 	t1 = clock();
 	if(myid==0)printf(" >> %.3f (s)\n", ((float)(t1-t0))/CLOCKS_PER_SEC); 

	/* free memory on device */
	cudaFree(d_wlt);
	cudaFree(d_vv);
	cudaFree(d_p0);
	cudaFree(d_p1);
	cudaFree(d_szxy);
	cudaFree(d_gzxy);
	cudaFree(d_dcal_device_xy);
	cudaFree(d_dcal_device_txy);
	free(v0);
	free(vv);
	free(d_dcal_host);

}