并行计算aml.c_first nonblocking barriers are blocking,so we call-CSDN博客

/* Copyright (c) 2011-2017 Graph500 Steering Committee
   All rights reserved.
   Developed by:                Anton Korzh anton@korzh.us
                                Graph500 Steering Committee
                                http://www.graph500.org
   New code under University of Illinois/NCSA Open Source License
   see license.txt or https://opensource.org/licenses/NCSA
*/

// AML: active messages library v 1.0
// MPI-3 passive transport
// transparent message aggregation greatly increases message-rate for loosy interconnects
// shared memory optimization used
// Implementation basic v1.0

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <malloc.h>
#include <unistd.h>
#include <mpi.h>
#include <pthread.h>

#define MAXGROUPS 65536        //number of nodes (core processes form a group on a same node)
#define AGGR (1024*32) //aggregation buffer size per dest in bytes : internode
#define AGGR_intra (1024*32) //aggregation buffer size per dest in bytes : intranode
#define NRECV 4 // number of preposted recvs internode
#define NRECV_intra 4 // number of preposted recvs intranode
#define NSEND 4 // number of available sends internode
#define NSEND_intra 4 // number of send intranode
#define SOATTR __attribute__((visibility("default")))

#define SENDSOURCE(node) ( sendbuf+(AGGR*nbuf[node]))
#define SENDSOURCE_intra(node) ( sendbuf_intra+(AGGR_intra*nbuf_intra[node]) )

#define ushort unsigned short
static int myproc,num_procs;
static int mygroup,num_groups;
static int mylocal,group_size;
#ifndef PROCS_PER_NODE_NOT_POWER_OF_TWO
static int loggroup,groupmask;
#define PROC_FROM_GROUPLOCAL(g,l) ((l)+((g)<<loggroup))
#define GROUP_FROM_PROC(p) ((p) >> loggroup)
#define LOCAL_FROM_PROC(p) ((p) & groupmask)
#else
#define PROC_FROM_GROUPLOCAL(g,l) ((g)*group_size+(l))
#define GROUP_FROM_PROC(p) ((p)/group_size)
#define LOCAL_FROM_PROC(p) ((p)%group_size)
#endif
volatile static int ack=0;

volatile static int inbarrier=0;

static void (*aml_handlers[256]) (int,void *,int); //pointers to user-provided AM handlers

//internode comm (proc number X from each group)
//intranode comm (all cores of one nodegroup)
MPI_Comm comm, comm_intra;

// MPI stuff for sends
static char *sendbuf; //coalescing buffers, most of memory is allocated is here
static int *sendsize; //buffer occupacy in bytes
static ushort *acks; //aggregated acks
static ushort *nbuf; //actual buffer for each group/localcore
static ushort activebuf[NSEND];// N_buffer used in transfer(0..NSEND{_intra}-1)
static MPI_Request rqsend[NSEND];
// MPI stuff for recv
static char recvbuf[AGGR*NRECV];
static MPI_Request rqrecv[NRECV];

unsigned long long nbytes_sent,nbytes_rcvd;

static char *sendbuf_intra;
static int *sendsize_intra;
static ushort *acks_intra;
static ushort *nbuf_intra;
static ushort activebuf_intra[NSEND_intra];
static MPI_Request rqsend_intra[NSEND_intra];
static char recvbuf_intra[AGGR_intra*NRECV_intra];
static MPI_Request rqrecv_intra[NRECV_intra];
volatile static int ack_intra=0;
inline void aml_send_intra(void *srcaddr, int type, int length, int local ,int from);

void aml_finalize(void);
void aml_barrier(void);

SOATTR void aml_register_handler(void(*f)(int,void*,int),int n) { aml_barrier(); aml_handlers[n]=f; aml_barrier(); }

struct __attribute__((__packed__)) hdr { //header of internode message
    ushort sz;
    char hndl;
    char routing;
};
//process internode messages
static void process(int fromgroup,int length ,char* message) {
    int i = 0;
    int from = PROC_FROM_GROUPLOCAL(fromgroup,mylocal);
    while ( i < length ) {
        void* m = message+i;
        struct hdr *h = m;
        int hsz=h->sz;
        int hndl=h->hndl;
        int destlocal = LOCAL_FROM_PROC(h->routing);
        if(destlocal == mylocal)
            aml_handlers[hndl](from,m+sizeof(struct hdr),hsz);
        else
            aml_send_intra(m+sizeof(struct hdr),hndl,hsz,destlocal,from);
        i += hsz + sizeof(struct hdr);
    }
}
struct __attribute__((__packed__)) hdri { //header of internode message
    ushort routing;
    ushort sz;
    char hndl;
};

//process intranode messages
static void process_intra(int fromlocal,int length ,char* message) {
    int i=0;
    while ( i < length ) {
        void*m = message+i;
        struct hdri *h = m;
        int hsz=h->sz;
        int hndl=h->hndl;
        aml_handlers[hndl](PROC_FROM_GROUPLOCAL((int)(h->routing),fromlocal),m+sizeof(struct hdri),hsz);
        i += sizeof(struct hdri) + hsz;
    }
}

// poll intranode message
inline void aml_poll_intra(void) {
    int flag, from, length,index;
    MPI_Status status;
    MPI_Testany( NRECV_intra,rqrecv_intra, &index, &flag, &status );
    if ( flag ) {
        MPI_Get_count( &status, MPI_CHAR, &length );
        ack_intra -= status.MPI_TAG;
        if(length>0) { //no confirmation & processing for ack only messages
            from = status.MPI_SOURCE;
            if(inbarrier)
                MPI_Send(NULL, 0, MPI_CHAR,from, 1, comm_intra); //ack now
            else
                acks_intra[from]++; //normally we have delayed ack
            process_intra( from, length,recvbuf_intra +AGGR_intra*index);
        }
        MPI_Start( rqrecv_intra+index);
    }
}
// poll internode message
static void aml_poll(void) {
    int flag, from, length,index;
    MPI_Status status;

    aml_poll_intra();

    MPI_Testany( NRECV,rqrecv,&index, &flag, &status );
    if ( flag ) {
        MPI_Get_count( &status, MPI_CHAR, &length );
        ack -= status.MPI_TAG;
        nbytes_rcvd+=length;
        if(length>0) { //no confirmation & processing for ack only messages
            from = status.MPI_SOURCE;
            if(inbarrier)
                MPI_Send(NULL, 0, MPI_CHAR,from, 1, comm); //ack now
            else
                acks[from]++; //normally we have delayed ack
            process( from, length,recvbuf+AGGR*index );
        }
        MPI_Start( rqrecv+index );
    }
}

//flush internode buffer to destination node
inline void flush_buffer( int node ) {
    MPI_Status stsend;
    int flag=0,index,tmp;
    if (sendsize[node] == 0 && acks[node]==0 ) return;
    while (!flag) {
        aml_poll();
        MPI_Testany(NSEND,rqsend,&index,&flag,&stsend);
    }
    MPI_Isend(SENDSOURCE(node), sendsize[node], MPI_CHAR,node, acks[node], comm, rqsend+index );
    nbytes_sent+=sendsize[node];
    if (sendsize[node] > 0) ack++;
    sendsize[node] = 0;
    acks[node] = 0;
    tmp=activebuf[index]; activebuf[index]=nbuf[node]; nbuf[node]=tmp; //swap bufs

}
//flush intranode buffer, NB:node is local number of pe in group
inline void flush_buffer_intra( int node ) {
    MPI_Status stsend;
    int flag=0,index,tmp;
    if (sendsize_intra[node] == 0 && acks_intra[node]==0 ) return;
    while (!flag) {
        aml_poll_intra();
        MPI_Testany(NSEND_intra,rqsend_intra,&index,&flag,&stsend);
    }
    MPI_Isend( SENDSOURCE_intra(node), sendsize_intra[node], MPI_CHAR,
            node, acks_intra[node], comm_intra, rqsend_intra+index );
    if (sendsize_intra[node] > 0) ack_intra++;
    sendsize_intra[node] = 0;
    acks_intra[node] = 0;
    tmp=activebuf_intra[index]; activebuf_intra[index]=nbuf_intra[node]; nbuf_intra[node]=tmp; //swap bufs

}

inline void aml_send_intra(void *src, int type, int length, int local, int from) {
    //send to _another_ process from same group
    int nmax = AGGR_intra - sendsize_intra[local] - sizeof(struct hdri);
    if ( nmax < length ) {
        flush_buffer_intra(local);
    }
    char* dst = (SENDSOURCE_intra(local)+sendsize_intra[local]);
    struct hdri *h=(void*)dst;
    h->routing = GROUP_FROM_PROC(from);
    h->sz=length;
    h->hndl = type;
    sendsize_intra[local] += length+sizeof(struct hdri);

    memcpy(dst+sizeof(struct hdri),src,length);
}

SOATTR void aml_send(void *src, int type,int length, int node ) {
    if ( node == myproc )
        return aml_handlers[type](myproc,src,length);

    int group = GROUP_FROM_PROC(node);
    int local = LOCAL_FROM_PROC(node);

    //send to another node in my group
    if ( group == mygroup )
        return aml_send_intra(src,type,length,local,myproc);

    //send to another group
    int nmax = AGGR - sendsize[group]-sizeof(struct hdr);
    if ( nmax < length ) {
        flush_buffer(group);
    }
    char* dst = (SENDSOURCE(group)+sendsize[group]);
    struct hdr *h=(void*)dst;
    h->routing = local;
    h->hndl = type;
    h->sz=length;
    sendsize[group] += length+sizeof(struct hdr);
    memcpy(dst+sizeof(struct hdr),src,length);
}


int stringCmp( const void *a, const void *b)
{ return strcmp(a,b);  }

// Should be called by user instead of MPI_Init()
SOATTR int aml_init( int *argc, char ***argv ) {
    int r, i, j,tmpmax;

    r = MPI_Init(argc, argv);
    if ( r != MPI_SUCCESS ) return r;

    MPI_Comm_size( MPI_COMM_WORLD, &num_procs );
    MPI_Comm_rank( MPI_COMM_WORLD, &myproc );

    //split communicator
    char host_name[MPI_MAX_PROCESSOR_NAME];
    char (*host_names)[MPI_MAX_PROCESSOR_NAME];
    int namelen,bytes,n,color;
    MPI_Get_processor_name(host_name,&namelen);

    bytes = num_procs * sizeof(char[MPI_MAX_PROCESSOR_NAME]);
    host_names = (char (*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
    strcpy(host_names[myproc], host_name);
    for (n=0; n<num_procs; n++)
        MPI_Bcast(&(host_names[n]),MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
    qsort(host_names, num_procs, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);
    color = 0;
    for (n=0; n<num_procs; n++)  {
        if(n>0 && strcmp(host_names[n-1], host_names[n])) color++;
        if(strcmp(host_name, host_names[n]) == 0) break;
    }
    free(host_names);
    MPI_Comm_split(MPI_COMM_WORLD, color, myproc, &comm_intra);

    //find intranode numbers and make internode communicator
    MPI_Comm_size( comm_intra, &group_size );
    MPI_Comm_rank( comm_intra, &mylocal );

    MPI_Comm_split(MPI_COMM_WORLD, mylocal, myproc, &comm);

    MPI_Comm_size( comm, &num_groups );
    MPI_Comm_rank( comm, &mygroup );

    //first nonblocking barriers are blocking,so we call them now
    MPI_Request hndl;
    MPI_Ibarrier(comm,&hndl);
    MPI_Wait(&hndl,MPI_STATUS_IGNORE);
    MPI_Ibarrier(comm_intra,&hndl);
    MPI_Wait(&hndl,MPI_STATUS_IGNORE);

#ifndef PROCS_PER_NODE_NOT_POWER_OF_TWO
    groupmask=group_size-1;
    if((group_size&groupmask)) { printf("AML: Fatal: non power2 groupsize unsupported. Define macro PROCS_PER_NODE_NOT_POWER_OF_TWO to override\n");return -1;}
    for (loggroup = 0; loggroup < group_size; loggroup++)
        if ((1 << loggroup) == group_size) break;
#endif
    if(myproc!=PROC_FROM_GROUPLOCAL(mygroup,mylocal)) {printf("AML: Fatal: Strange group rank assignment scheme.\n");return -1;}
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);

    CPU_SET(mylocal,&cpuset); //FIXME ? would it work good enough on all architectures?
    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
#ifdef DEBUGSTATS
    if(myproc==0) printf ("AML: multicore, num_groups %d group_size %d\n",num_groups,group_size);
#ifdef PROCS_PER_NODE_NOT_POWER_OF_TWO
    if(myproc==0) printf ("AML: multicore, PROCS_PER_NODE_NOT_POWER_OF_TWO defined\n");
#else
    if(myproc==0) printf ("AML: multicore, loggroup=%d groupmask=%d\n",loggroup,groupmask);
#endif
    if(myproc==0) printf ("NRECV=%d NRECVi=%d NSEND=%d  NSENDi=%d AGGR=%dK AGGRi=%dK\n",NRECV,NRECV_intra,NSEND,NSEND_intra,AGGR>>10,AGGR_intra>>10);
#endif
    if(num_groups>MAXGROUPS) { if(myproc==0) printf("AML:v1.0 reference:unsupported num_groups > MAXGROUPS=%d\n",MAXGROUPS); exit(-1); }
    fflush(NULL);
    //init preposted recvs: NRECV internode
    for(i=0;i<NRECV;i++)  {
        r = MPI_Recv_init( recvbuf+AGGR*i, AGGR, MPI_CHAR,MPI_ANY_SOURCE, MPI_ANY_TAG, comm,rqrecv+i );
        if ( r != MPI_SUCCESS ) return r;
    }
    sendbuf = malloc( AGGR*(num_groups+NSEND));
    if ( !sendbuf ) return -1;
    memset(sendbuf,0,AGGR*(num_groups+NSEND));
    sendsize = malloc( num_groups*sizeof(*sendsize) );
    if (!sendsize) return -1;
    acks = malloc( num_groups*sizeof(*acks) );
    if (!acks) return -1;
    nbuf = malloc( num_groups*sizeof(*nbuf) );
    if (!nbuf) return -1;


    for(i=0;i<NRECV_intra;i++)  {
        r = MPI_Recv_init( recvbuf_intra+AGGR_intra*i, AGGR_intra, MPI_CHAR,MPI_ANY_SOURCE, MPI_ANY_TAG, comm_intra,rqrecv_intra+i );
        if ( r != MPI_SUCCESS ) return r;
    }
    sendbuf_intra = malloc( AGGR_intra*(group_size+NSEND_intra));
    if ( !sendbuf_intra ) return -1;
    memset(sendbuf_intra,0,AGGR_intra*(group_size+NSEND_intra));
    sendsize_intra = malloc( group_size*sizeof(*sendsize_intra) );
    if (!sendsize_intra) return -1;
    acks_intra = malloc( group_size*sizeof(*acks_intra) );
    if (!acks_intra) return -1;
    nbuf_intra = malloc( group_size*sizeof(*nbuf_intra) );
    if (!nbuf_intra) return -1;
    for ( j = 0; j < group_size; j++ ) {
        sendsize_intra[j] = 0; nbuf_intra[j] = j; acks_intra[j]=0;
    }
    for(i=0;i<NRECV_intra;i++)
        MPI_Start(rqrecv_intra+i);

    for ( j = 0; j < NSEND_intra; j++ ) {
        MPI_Isend( NULL, 0, MPI_CHAR, MPI_PROC_NULL, 0, comm_intra, rqsend_intra+j );
        activebuf_intra[j]=group_size+j;
    }

    for ( j = 0; j < num_groups; j++ ) {
        sendsize[j] = 0; nbuf[j] = j;  acks[j]=0;
    }
    for(i=0;i<NRECV;i++)
        MPI_Start( rqrecv+i );
    for ( j = 0; j < NSEND; j++ ) {
        MPI_Isend( NULL, 0, MPI_CHAR, MPI_PROC_NULL, 0, comm, rqsend+j );
        activebuf[j]=num_groups+j;
    }
    return 0;
}

SOATTR void aml_barrier( void ) {
    int i,flag;
    MPI_Request hndl;
    inbarrier++;
    //1. flush internode buffers
    for ( i = 1; i < num_groups; i++ ) {
        int group=(mygroup+i)%num_groups;
        flush_buffer(group);
    }
    //2. wait for all internode being acknowledged
    while(ack!=0) aml_poll();
    //3. notify everybody that all my internode messages were received
    MPI_Ibarrier(comm,&hndl);
    //4. receive internode until barrier done
    flag=0;
    while(flag==0) {
        MPI_Test(&hndl,&flag,MPI_STATUS_IGNORE); aml_poll(); }
    // NB: All internode received here. I can receive some more intranode.

    //5. Flush all intranode buffers
    for ( i = 1; i < group_size; i++ ) {
        int localproc=LOCAL_FROM_PROC(mylocal+i);
        flush_buffer_intra(localproc);
    }
    //inbarrier=2;
    //6. wait for all intranode being acknowledged
    while(ack_intra!=0) aml_poll_intra();
    //7. notify everybody that all my intranode messages were received
    MPI_Ibarrier(comm_intra,&hndl);
    //8. receive internode until barrier done
    flag=0;
    while(flag==0) {
        MPI_Test(&hndl,&flag,MPI_STATUS_IGNORE); aml_poll_intra(); }
    inbarrier--;
    MPI_Barrier(MPI_COMM_WORLD);
}

SOATTR void aml_finalize( void ) {
    int i;
    aml_barrier();
    for(i=0;i<NRECV;i++)
        MPI_Cancel(rqrecv+i);
#ifndef NOINTRA
    for(i=0;i<NRECV_intra;i++)
        MPI_Cancel(rqrecv_intra+i);
#endif
    MPI_Finalize();
}

SOATTR int aml_my_pe(void) { return myproc; }
SOATTR int aml_n_pes(void) { return num_procs; }