多机多卡技术测试-单节点多DCU（数据传输型）

最新推荐文章于 2023-10-27 14:28:06 发布

Rocky_myx

最新推荐文章于 2023-10-27 14:28:06 发布

阅读量385

点赞数

分类专栏：并行计算（CUDA/HIP/多机多卡）文章标签：并行计算

本文链接：https://blog.csdn.net/qq_33562636/article/details/108761837

版权

并行计算（CUDA/HIP/多机多卡）专栏收录该内容

13 篇文章 5 订阅

订阅专栏

文件目录：
在这里插入图片描述
makefile

SOURCE =$(wildcard *.cpp)
OBJS =$(patsubst %.cpp,%,$(SOURCE))
HIPCC = /opt/rocm/bin/hipcc
GCC=/opt/rh/devtoolset-7/root/usr/bin/gcc

all:$(OBJS) 
$(OBJS):%:%.cpp
	$(HIPCC)  $^ -o  $@
run:
	./$(OBJS) 2 
clean:
	-rm $(OBJS)

simpleP2P_PingPong.cpp

#include "../common/common.h"
#include <stdlib.h>
#include <stdio.h>
#include <hip/hip_runtime.h>


__global__ void iKernel(float *src, float *dst)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx] * 2.0f;
}


inline bool isCapableP2P(int ngpus)
{
    hipDeviceProp_t prop[ngpus];
    int iCount = 0;

    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipGetDeviceProperties(&prop[i], i));

        if (prop[i].major >= 2) iCount++;

        printf("> GPU%d: %s %s capable of Peer-to-Peer access\n", i,prop[i].name, (prop[i].major >= 2 ? "is" : "not"));
    }

    if(iCount != ngpus)
    {
        printf("> no enough device to run this application\n");
    }

    return (iCount == ngpus);
}


inline void enableP2P (int ngpus)
{
    for( int i = 0; i < ngpus; i++ )
    {
        CHECK(hipSetDevice(i));

        for(int j = 0; j < ngpus; j++)
        {
            if(i == j) continue;

            int peer_access_available = 0;
            CHECK(hipDeviceCanAccessPeer(&peer_access_available, i, j));

            if (peer_access_available)
            {
                CHECK(hipDeviceEnablePeerAccess(j, 0));
                printf("> GPU%d enabled direct access to GPU%d\n", i, j);
            }
            else
            {
                printf("(%d, %d)\n", i, j );
            }
        }
    }
}



inline void disableP2P (int ngpus)
{
    for( int i = 0; i < ngpus; i++ )
    {
        CHECK(hipSetDevice(i));

        for(int j = 0; j < ngpus; j++)
        {
            if( i == j ) continue;

            int peer_access_available = 0;
            CHECK(hipDeviceCanAccessPeer( &peer_access_available, i, j) );

            if( peer_access_available )
            {
                CHECK(hipDeviceDisablePeerAccess(j));
                printf("> GPU%d disabled direct access to GPU%d\n", i, j);
            }
        }
    }
}


void initialData(float *ip, int size)
{
    for(int i = 0; i < size; i++)
    {
        ip[i] = (float)rand() / (float)RAND_MAX;
    }
}


int main(int argc, char **argv)
{
    int ngpus;

    // check device count
    CHECK(hipGetDeviceCount(&ngpus));
    printf("> CUDA-capable device count: %i\n", ngpus);

    // check p2p capability
    isCapableP2P(ngpus);

    // get ngpus from command line
    if (argc > 1)
    {
        if (atoi(argv[1]) > ngpus)
        {
            fprintf(stderr, "Invalid number of GPUs specified: %d is greater "
                    "than the total number of GPUs in this platform (%d)\n",
                    atoi(argv[1]), ngpus);
            return 1;
        }

        ngpus  = atoi(argv[1]);
    }

    if (ngpus > 2)
    {
        fprintf(stderr, "No more than 2 GPUs supported\n");
        return 1;
    }

    if (ngpus > 1) enableP2P(ngpus);

    // Allocate buffers
    int iSize = 1024 * 1024 * 16;
    const size_t iBytes = iSize * sizeof(float);
    printf("\nAllocating buffers (%iMB on each GPU and CPU Host)...\n",
           int(iBytes / 1024 / 1024));

    float **d_src = (float **)malloc(sizeof(float) * ngpus);
    float **d_rcv = (float **)malloc(sizeof(float) * ngpus);
    float **h_src = (float **)malloc(sizeof(float) * ngpus);
    hipStream_t *stream = (hipStream_t *)malloc(sizeof(hipStream_t) * ngpus);

    // Create CUDA event handles
    hipEvent_t start, stop;
    CHECK(hipSetDevice(0));
    CHECK(hipEventCreate(&start));
    CHECK(hipEventCreate(&stop));

    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));
        CHECK(hipMalloc(&d_src[i], iBytes));
        CHECK(hipMalloc(&d_rcv[i], iBytes));
        CHECK(hipHostMalloc((void **) &h_src[i], iBytes));

        CHECK(hipStreamCreate(&stream[i]));
    }

    for (int i = 0; i < ngpus; i++)
    {
        initialData(h_src[i], iSize);
    }

    // unidirectional gmem copy
    CHECK(hipSetDevice(0));
    CHECK(hipEventRecord(start, 0));

    for (int i = 0; i < 100; i++)
    {
        if (i % 2 == 0)
        {
            CHECK(hipMemcpy(d_src[1], d_src[0], iBytes,hipMemcpyDeviceToDevice));
        }
        else
        {
            CHECK(hipMemcpy(d_src[0], d_src[1], iBytes,hipMemcpyDeviceToDevice));
        }
    }

    CHECK(hipSetDevice(0));
    CHECK(hipEventRecord(stop, 0));
    CHECK(hipEventSynchronize(stop));

    float elapsed_time_ms;
    CHECK(hipEventElapsedTime(&elapsed_time_ms, start, stop ));

    elapsed_time_ms /= 100.0f;
    printf("Ping-pong unidirectional hipMemcpy:\t\t %8.2f ms ",
           elapsed_time_ms);
    printf("performance: %8.2f GB/s\n",
            (float)iBytes / (elapsed_time_ms * 1e6f));



    //  bidirectional asynchronous gmem copy
    CHECK(hipEventRecord(start, 0));

    for (int i = 0; i < 100; i++)
    {
        CHECK(hipMemcpyAsync(d_src[1], d_src[0], iBytes,
                    hipMemcpyDeviceToDevice, stream[0]));
        CHECK(hipMemcpyAsync(d_rcv[0], d_rcv[1], iBytes,
                    hipMemcpyDeviceToDevice, stream[1]));
    }

    CHECK(hipSetDevice(0));
    CHECK(hipEventRecord(stop, 0));
    CHECK(hipEventSynchronize(stop));

    elapsed_time_ms = 0.0f;
    CHECK(hipEventElapsedTime(&elapsed_time_ms, start, stop ));

    elapsed_time_ms /= 100.0f;
    printf("Ping-pong bidirectional hipMemcpyAsync:\t %8.2fms ",
           elapsed_time_ms);
    printf("performance: %8.2f GB/s\n",
           (float) 2.0f * iBytes / (elapsed_time_ms * 1e6f) );

    printf("\n");
    disableP2P(ngpus);

    // free
    CHECK(hipSetDevice(0));
    CHECK(hipEventDestroy(start));
    CHECK(hipEventDestroy(stop));

    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));
        CHECK(hipFree(d_src[i]));
        CHECK(hipFree(d_rcv[i]));
        CHECK(hipStreamDestroy(stream[i]));
        CHECK(hipDeviceReset());
    }

    exit(EXIT_SUCCESS);
}

运行结果：
在这里插入图片描述

Rocky_myx

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
多机多卡技术测试-单节点多DCU（数据传输型）

文件目录：makefileSOURCE =$(wildcard *.cpp)OBJS =$(patsubst %.cpp,%,$(SOURCE))HIPCC = /opt/rocm/bin/hipccGCC=/opt/rh/devtoolset-7/root/usr/bin/gccall:$(OBJS) $(OBJS):%:%.cpp $(HIPCC) $^ -o $@run: ./$(OBJS) 2 clean: -rm $(OBJS)simpleP2P_PingPon
复制链接

扫一扫

专栏目录