rocm PeerAccess 测试


rocm PeerAccess 测试

一.参考链接

二.测试过程

1.登录服务器


2.使用smi获取列表

rocm-smi

输出

=========================================== ROCm System Management Interface ===========================================
===================================================== Concise Info =====================================================
Device  Node  IDs              Temp    Power     Partitions          SCLK    MCLK    Fan    Perf  PwrCap  VRAM%  GPU%
              (DID,     GUID)  (Edge)  (Socket)  (Mem, Compute, ID)
========================================================================================================================
0       1     0x66a1,   3820   35.0°C  20.0W     N/A, N/A, 0         860Mhz  350Mhz  9.41%  auto  190.0W  0%     0%
1       2     0x66a1,   22570  38.0°C  17.0W     N/A, N/A, 0         860Mhz  350Mhz  9.41%  auto  190.0W  0%     0%
========================================================================================================================
================================================= End of ROCm SMI Log ==================================================

rocm-smi --showbus
======================================= PCI Bus ID =======================================
GPU[0]          : PCI Bus: 0000:07:00.0
GPU[1]          : PCI Bus: 0000:0E:00.0
==========================================================================================

lspci -s 0000:07:00.0 -vv | grep "Region"
lspci -s 0000:0E:00.0 -vv | grep "Region"

Region 0: Memory at d0000000 (64-bit, prefetchable) [size=256M]
Region 2: Memory at e0000000 (64-bit, prefetchable) [size=2M]
Region 4: I/O ports at e000 [size=256]
Region 5: Memory at fc400000 (32-bit, non-prefetchable) [size=512K]

Region 0: Memory at b0000000 (64-bit, prefetchable) [size=256M]
Region 2: Memory at c0000000 (64-bit, prefetchable) [size=2M]
Region 4: I/O ports at f000 [size=256]
Region 5: Memory at fcd00000 (32-bit, non-prefetchable) [size=512K]

rocm-smi --showtopo
============================ ROCm System Management Interface ============================
================================ Weight between two GPUs =================================
       GPU0         GPU1
GPU0   0            40
GPU1   40           0

================================= Hops between two GPUs ==================================
       GPU0         GPU1
GPU0   0            2
GPU1   2            0

=============================== Link Type between two GPUs ===============================
       GPU0         GPU1
GPU0   0            PCIE
GPU1   PCIE         0

======================================= Numa Nodes =======================================
GPU[0]          : (Topology) Numa Node: 0
GPU[0]          : (Topology) Numa Affinity: -1
GPU[1]          : (Topology) Numa Node: 0
GPU[1]          : (Topology) Numa Affinity: -1
================================== End of ROCm SMI Log ===================================

4.准备测试用例

tee p2p.cpp<<-'EOF'
#include <iostream>
// hip header file
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <thread>

#define HIP_CHECK(call) \
    do { \
        hipError_t error = call; \
        if (error != hipSuccess) { \
            fprintf(stderr, "HIP error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, hipGetErrorString(error)); \
            return -1; \
        } \
    } while (0)

template<int mode>
__global__ void dummyKernel(float *input_data,float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    output_data[idx]=input_data[idx];
}

template <typename F>
int TIMEIT(F const &f,hipStream_t &stream,hipEvent_t &start_ev,hipEvent_t&stop_ev)
{ 
    HIP_CHECK(hipDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    HIP_CHECK(hipEventRecord(start_ev, stream)); 
    f(stream); 
    HIP_CHECK(hipEventRecord(stop_ev, stream)); 
    HIP_CHECK(hipEventSynchronize(stop_ev)); 
    auto end = std::chrono::high_resolution_clock::now(); 
    std::chrono::duration<double> diff = end - start; 
    float milliseconds = 0; 
    HIP_CHECK(hipEventElapsedTime(&milliseconds, start_ev, stop_ev)); 
    printf("E2E:%7.2fms Kernel:%7.2fms errno:%d\n",diff.count()*1000,milliseconds,hipGetLastError());
    return 0;
}

int main() {
    int devID0 = 0, devID1 = 1;
    int device_count=0;
    HIP_CHECK(hipGetDeviceCount(&device_count));
    for(int deviceid=0; deviceid<device_count;deviceid++)
    {
        std::cout << "---------------"<<"dev:"<<deviceid<<"--------------------" << std::endl;
        HIP_CHECK(hipSetDevice(deviceid));
        int value;
        HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeCudaCompatibleBegin,deviceid));
        std::cout << "hipDeviceAttributeCudaCompatibleBegin: " << value << std::endl;

        HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeAsyncEngineCount,deviceid));
        std::cout << "hipDeviceAttributeAsyncEngineCount: " << value << std::endl;

        HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeCanMapHostMemory,deviceid));
        std::cout << "hipDeviceAttributeCanMapHostMemory: " << value << std::endl;

        HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeManagedMemory,deviceid));
        std::cout << "hipDeviceAttributeManagedMemory: " << value << std::endl;

        HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeDirectManagedMemAccessFromHost,deviceid));
        std::cout << "hipDeviceAttributeDirectManagedMemAccessFromHost: " << value << std::endl;

        HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeUnifiedAddressing,deviceid));
        std::cout << "hipDeviceAttributeUnifiedAddressing: " << value << std::endl;
    }
    std::cout << "-----------------------------------" << std::endl;
    int p2p_value=0;
    HIP_CHECK(hipDeviceGetP2PAttribute(&p2p_value,hipDevP2PAttrAccessSupported,devID0,devID1));
    std::cout << "hipDevP2PAttrAccessSupported: " << p2p_value << std::endl;
    
    #define block_size 1024L
    #define block_count ((512<<20)/block_size/4)
    
    size_t dataSize = block_count*block_size * sizeof(float);
    float *data0_dev, *data1_dev,*data1_dev_ex;

    HIP_CHECK(hipSetDevice(devID0));
    HIP_CHECK(hipMalloc(&data0_dev, dataSize));

    HIP_CHECK(hipSetDevice(devID1));
    HIP_CHECK(hipMalloc(&data1_dev, dataSize));
    HIP_CHECK(hipMalloc(&data1_dev_ex, dataSize));
    HIP_CHECK(hipMemcpy(data0_dev,data1_dev,dataSize,hipMemcpyDeviceToDevice));
    HIP_CHECK(hipMemcpy(data0_dev,data1_dev,dataSize,hipMemcpyDeviceToDeviceNoCU));

    printf("Init Done(%.2f)MB..\n",dataSize/1024.0/1024.0);
    // 启用P2P
    int canAccessPeer=0;
    HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));
    if (canAccessPeer) {
        HIP_CHECK(hipSetDevice(devID1));
        hipStream_t stream;
        HIP_CHECK(hipStreamCreate(&stream));

        hipEvent_t start_ev, stop_ev;
        HIP_CHECK(hipEventCreate(&start_ev));
        HIP_CHECK(hipEventCreate(&stop_ev));
        
        HIP_CHECK(hipDeviceEnablePeerAccess(devID0, 0));//让devID1可以访问devID0的设备内存

        TIMEIT([&](hipStream_t &stream)-> void {dummyKernel<1><<<block_count, block_size,0,stream>>>(data0_dev,data1_dev);},stream,start_ev,stop_ev);
        TIMEIT([&](hipStream_t &stream)-> void {dummyKernel<2><<<block_count, block_size,0,stream>>>(data1_dev_ex,data1_dev);},stream,start_ev,stop_ev);

        HIP_CHECK(hipDeviceDisablePeerAccess(devID0));
    }
    else
    {
        printf("%s %d canAccessPeer=0\n",__FILE__,__LINE__);
    }

    HIP_CHECK(hipFree(data0_dev));
    HIP_CHECK(hipFree(data1_dev));
    HIP_CHECK(hipFree(data1_dev_ex));
    return 0;
}
EOF
/opt/rocm/bin/hipcc -c p2p.cpp -o p2p.cpp.o
/opt/rocm/bin/hipcc p2p.cpp.o -o p2p 
./p2p
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Hi20240217

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值