rocm PeerAccess 测试
rocm PeerAccess 测试
一.参考链接
二.测试过程
1.登录服务器
2.使用smi获取列表
rocm-smi
输出
=========================================== ROCm System Management Interface ===========================================
===================================================== Concise Info =====================================================
Device Node IDs Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU%
(DID, GUID) (Edge) (Socket) (Mem, Compute, ID)
========================================================================================================================
0 1 0x66a1, 3820 35.0°C 20.0W N/A, N/A, 0 860Mhz 350Mhz 9.41% auto 190.0W 0% 0%
1 2 0x66a1, 22570 38.0°C 17.0W N/A, N/A, 0 860Mhz 350Mhz 9.41% auto 190.0W 0% 0%
========================================================================================================================
================================================= End of ROCm SMI Log ==================================================
rocm-smi --showbus
======================================= PCI Bus ID =======================================
GPU[0] : PCI Bus: 0000:07:00.0
GPU[1] : PCI Bus: 0000:0E:00.0
==========================================================================================
lspci -s 0000:07:00.0 -vv | grep "Region"
lspci -s 0000:0E:00.0 -vv | grep "Region"
Region 0: Memory at d0000000 (64-bit, prefetchable) [size=256M]
Region 2: Memory at e0000000 (64-bit, prefetchable) [size=2M]
Region 4: I/O ports at e000 [size=256]
Region 5: Memory at fc400000 (32-bit, non-prefetchable) [size=512K]
Region 0: Memory at b0000000 (64-bit, prefetchable) [size=256M]
Region 2: Memory at c0000000 (64-bit, prefetchable) [size=2M]
Region 4: I/O ports at f000 [size=256]
Region 5: Memory at fcd00000 (32-bit, non-prefetchable) [size=512K]
rocm-smi --showtopo
============================ ROCm System Management Interface ============================
================================ Weight between two GPUs =================================
GPU0 GPU1
GPU0 0 40
GPU1 40 0
================================= Hops between two GPUs ==================================
GPU0 GPU1
GPU0 0 2
GPU1 2 0
=============================== Link Type between two GPUs ===============================
GPU0 GPU1
GPU0 0 PCIE
GPU1 PCIE 0
======================================= Numa Nodes =======================================
GPU[0] : (Topology) Numa Node: 0
GPU[0] : (Topology) Numa Affinity: -1
GPU[1] : (Topology) Numa Node: 0
GPU[1] : (Topology) Numa Affinity: -1
================================== End of ROCm SMI Log ===================================
4.准备测试用例
tee p2p.cpp<<-'EOF'
#include <iostream>
// hip header file
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <thread>
#define HIP_CHECK(call) \
do { \
hipError_t error = call; \
if (error != hipSuccess) { \
fprintf(stderr, "HIP error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, hipGetErrorString(error)); \
return -1; \
} \
} while (0)
template<int mode>
__global__ void dummyKernel(float *input_data,float *output_data) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
output_data[idx]=input_data[idx];
}
template <typename F>
int TIMEIT(F const &f,hipStream_t &stream,hipEvent_t &start_ev,hipEvent_t&stop_ev)
{
HIP_CHECK(hipDeviceSynchronize());
auto start = std::chrono::high_resolution_clock::now();
HIP_CHECK(hipEventRecord(start_ev, stream));
f(stream);
HIP_CHECK(hipEventRecord(stop_ev, stream));
HIP_CHECK(hipEventSynchronize(stop_ev));
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
float milliseconds = 0;
HIP_CHECK(hipEventElapsedTime(&milliseconds, start_ev, stop_ev));
printf("E2E:%7.2fms Kernel:%7.2fms errno:%d\n",diff.count()*1000,milliseconds,hipGetLastError());
return 0;
}
int main() {
int devID0 = 0, devID1 = 1;
int device_count=0;
HIP_CHECK(hipGetDeviceCount(&device_count));
for(int deviceid=0; deviceid<device_count;deviceid++)
{
std::cout << "---------------"<<"dev:"<<deviceid<<"--------------------" << std::endl;
HIP_CHECK(hipSetDevice(deviceid));
int value;
HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeCudaCompatibleBegin,deviceid));
std::cout << "hipDeviceAttributeCudaCompatibleBegin: " << value << std::endl;
HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeAsyncEngineCount,deviceid));
std::cout << "hipDeviceAttributeAsyncEngineCount: " << value << std::endl;
HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeCanMapHostMemory,deviceid));
std::cout << "hipDeviceAttributeCanMapHostMemory: " << value << std::endl;
HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeManagedMemory,deviceid));
std::cout << "hipDeviceAttributeManagedMemory: " << value << std::endl;
HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeDirectManagedMemAccessFromHost,deviceid));
std::cout << "hipDeviceAttributeDirectManagedMemAccessFromHost: " << value << std::endl;
HIP_CHECK(hipDeviceGetAttribute(&value,hipDeviceAttributeUnifiedAddressing,deviceid));
std::cout << "hipDeviceAttributeUnifiedAddressing: " << value << std::endl;
}
std::cout << "-----------------------------------" << std::endl;
int p2p_value=0;
HIP_CHECK(hipDeviceGetP2PAttribute(&p2p_value,hipDevP2PAttrAccessSupported,devID0,devID1));
std::cout << "hipDevP2PAttrAccessSupported: " << p2p_value << std::endl;
#define block_size 1024L
#define block_count ((512<<20)/block_size/4)
size_t dataSize = block_count*block_size * sizeof(float);
float *data0_dev, *data1_dev,*data1_dev_ex;
HIP_CHECK(hipSetDevice(devID0));
HIP_CHECK(hipMalloc(&data0_dev, dataSize));
HIP_CHECK(hipSetDevice(devID1));
HIP_CHECK(hipMalloc(&data1_dev, dataSize));
HIP_CHECK(hipMalloc(&data1_dev_ex, dataSize));
HIP_CHECK(hipMemcpy(data0_dev,data1_dev,dataSize,hipMemcpyDeviceToDevice));
HIP_CHECK(hipMemcpy(data0_dev,data1_dev,dataSize,hipMemcpyDeviceToDeviceNoCU));
printf("Init Done(%.2f)MB..\n",dataSize/1024.0/1024.0);
// 启用P2P
int canAccessPeer=0;
HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));
if (canAccessPeer) {
HIP_CHECK(hipSetDevice(devID1));
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
hipEvent_t start_ev, stop_ev;
HIP_CHECK(hipEventCreate(&start_ev));
HIP_CHECK(hipEventCreate(&stop_ev));
HIP_CHECK(hipDeviceEnablePeerAccess(devID0, 0));//让devID1可以访问devID0的设备内存
TIMEIT([&](hipStream_t &stream)-> void {dummyKernel<1><<<block_count, block_size,0,stream>>>(data0_dev,data1_dev);},stream,start_ev,stop_ev);
TIMEIT([&](hipStream_t &stream)-> void {dummyKernel<2><<<block_count, block_size,0,stream>>>(data1_dev_ex,data1_dev);},stream,start_ev,stop_ev);
HIP_CHECK(hipDeviceDisablePeerAccess(devID0));
}
else
{
printf("%s %d canAccessPeer=0\n",__FILE__,__LINE__);
}
HIP_CHECK(hipFree(data0_dev));
HIP_CHECK(hipFree(data1_dev));
HIP_CHECK(hipFree(data1_dev_ex));
return 0;
}
EOF
/opt/rocm/bin/hipcc -c p2p.cpp -o p2p.cpp.o
/opt/rocm/bin/hipcc p2p.cpp.o -o p2p
./p2p
576

被折叠的 条评论
为什么被折叠?



