Exploring How NVIDIA GPU PeerAccess Reaches Beyond the PCIe BAR Window Size

1. Related Links

2. Observed Phenomena

  • When GPU0 reads data from GPU1 via PeerAccess, GPU0's PCIe link shows about 21% TX utilization while GPU1's shows about 21% RX. It looks as if GPU0 is configuring GPU1 over PCIe during this process.
  • While the kernel runs, neither GPU's BAR1 appears to be accessed, and NVIDIA exposes no BAR1 metrics here. It is also possible that GPU0 programs GPU1's BAR1 window mapping through GPU1's BAR0 registers.
  • If instead GPU0 raised an MMU page fault during the kernel and the host driver configured the BAR1 mapping through BAR0, then GPU0's PCIe link should show no TX utilization.
  • As a next step, the traffic could be captured with a logic analyzer for further analysis. The BAR1 window sizes themselves can be checked with the NVML sketch below.
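As context for the observations above, the BAR1 window size can be read directly. The following is a minimal sketch I am adding here (not part of the original experiment; the file name bar1info.c is arbitrary) that queries each GPU's BAR1 total and usage through NVML's nvmlDeviceGetBAR1MemoryInfo. It assumes the CUDA toolkit headers and libnvidia-ml are installed:

tee bar1info.c<<-'EOF'
#include <stdio.h>
#include <nvml.h>

/* Print BAR1 total/used/free for every GPU via NVML. */
int main(void) {
    if (nvmlInit_v2() != NVML_SUCCESS) return 1;
    unsigned int count = 0;
    nvmlDeviceGetCount_v2(&count);
    for (unsigned int i = 0; i < count; i++) {
        nvmlDevice_t dev;
        if (nvmlDeviceGetHandleByIndex_v2(i, &dev) != NVML_SUCCESS) continue;
        nvmlBAR1Memory_t bar1;
        if (nvmlDeviceGetBAR1MemoryInfo(dev, &bar1) == NVML_SUCCESS)
            printf("GPU%u BAR1 total=%llu MiB used=%llu MiB free=%llu MiB\n", i,
                   bar1.bar1Total >> 20, bar1.bar1Used >> 20, bar1.bar1Free >> 20);
    }
    nvmlShutdown();
    return 0;
}
EOF
gcc -o bar1info bar1info.c -I/usr/local/cuda/include -lnvidia-ml
./bar1info

Comparing bar1Total (256 MB on these cards, per the lspci output in section 8) against the peer-mapped allocation sizes used below is what motivates the question in the title.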

3. Upgrading to cuda_12.6.2 [optional]

wget https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
sudo apt-get --purge -y remove 'nvidia*'
sh cuda_12.6.2_560.35.03_linux.run

4. Installing open-gpu-kernel-modules [optional, for debugging the NV driver source]

git clone --branch 560.35.03 --single-branch https://github.com/NVIDIA/open-gpu-kernel-modules.git
cd open-gpu-kernel-modules
git checkout -b 560.35.03
make modules -j$(nproc)

rmmod nvidia-uvm
rmmod nvidia-drm
rmmod nvidia-modeset
rmmod nvidia
dmesg  -C

insmod kernel-open/nvidia.ko
insmod kernel-open/nvidia-modeset.ko
insmod kernel-open/nvidia-drm.ko
insmod kernel-open/nvidia-uvm.ko

5. Testing Host-Memory Access from a Kernel, and H2D Copies

tee p2p.cu<<-'EOF'
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

template<int mode>
__global__ void copyKernel(float *input_data,float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    output_data[idx]=input_data[idx];
}

template<int mode>
__global__ void incKernel(float *input_data,float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    input_data[idx] += 1.0f;  // in-place increment; output_data is unused here
}

template <typename F>
void TIMEIT(F const &f,cudaStream_t &stream,cudaEvent_t &start_ev,cudaEvent_t&stop_ev)
{ 
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    cudaEventRecord(start_ev, stream); 
    f(stream); 
    cudaEventRecord(stop_ev, stream); 
    CUDA_CHECK(cudaEventSynchronize(stop_ev)); 
    auto end = std::chrono::high_resolution_clock::now(); 
    std::chrono::duration<double> diff = end - start; 
    float milliseconds = 0; 
    cudaEventElapsedTime(&milliseconds, start_ev, stop_ev); 
    printf("E2E:%7.2fms Kernel:%7.2fms\n",diff.count()*1000,milliseconds);
}

int main() {
    int devID0 = 0;
    
    #define block_size 1024L
    #define block_count ((1024<<20)/block_size/4)  // 1 GiB total, exceeding the BAR size
    
    size_t dataSize = block_count * block_size * sizeof(float);
    float *data0;
    float *data1;

    CUDA_CHECK(cudaSetDevice(devID0));
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    CUDA_CHECK(cudaMallocHost(&data0,dataSize));
    CUDA_CHECK(cudaMalloc(&data1, dataSize));
    
    cudaEvent_t start_ev, stop_ev;
    cudaEventCreate(&start_ev);
    cudaEventCreate(&stop_ev);
    TIMEIT([&](cudaStream_t &stream)-> void {cudaMemcpyAsync(data1,data0,dataSize,cudaMemcpyHostToDevice,stream);},stream,start_ev,stop_ev);
    TIMEIT([&](cudaStream_t &stream)-> void {copyKernel<1><<<block_count, block_size,0,stream>>>(data0,data1);},stream,start_ev,stop_ev);
    TIMEIT([&](cudaStream_t &stream)-> void {incKernel<1><<<block_count, block_size,0,stream>>>(data0,data1);},stream,start_ev,stop_ev);
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
/usr/local/cuda/bin/nsys profile --stats=true -o cuda_profiling_report.nsys-rep -f true -t cuda,nvtx --gpu-metrics-frequency=100 --gpu-metrics-devices=0 ./p2p
/usr/local/cuda/bin/ncu --metrics \
dram__bytes_read.sum,\
dram__bytes_write.sum,\
lts__t_sectors_srcunit_tex_aperture_sysmem.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_device.sum,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
lts__t_bytes.sum,\
smsp__sass_inst_executed_op_global_ld.sum ./p2p
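As a rough sanity check on these counters (my back-of-envelope numbers, assuming one 4-byte load per thread and 32-byte L2 sectors): dataSize is 1 GiB, so copyKernel should move about 2^30 / 32 = 33,554,432 L2 read sectors for the input buffer, execute 262144 blocks x 1024 threads / 32 lanes = 8,388,608 global-load SASS instructions, and pcie__read_bytes.sum should be on the order of 1 GiB.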

From the L2 metrics we can see that the requests all went to the peer aperture.
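Why can a kernel dereference a host pointer at all? cudaMallocHost returns pinned host memory that is also mapped into the device's address space. A small sketch I am adding (a hypothetical check, not from the original test run) with cudaPointerGetAttributes makes the device-side alias visible; under unified addressing the device pointer typically equals the host pointer:

tee ptrattr.cu<<-'EOF'
#include <cstdio>
#include <cuda_runtime.h>

// Show that a cudaMallocHost allocation carries a device-side alias,
// which is why a kernel can dereference it directly (over PCIe).
int main() {
    float *host = nullptr;
    cudaMallocHost(&host, 1 << 20);
    cudaPointerAttributes attr{};
    cudaPointerGetAttributes(&attr, host);
    printf("type=%d (1=host-pinned) hostPointer=%p devicePointer=%p\n",
           (int)attr.type, attr.hostPointer, attr.devicePointer);
    cudaFreeHost(host);
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o ptrattr ptrattr.cu
./ptrattr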

6. Setting Up pcm to Monitor Host-Memory Bandwidth, to Determine Whether PeerAccess Goes Through Host Memory

git clone --recursive https://github.com/intel/pcm
cd pcm
git submodule update --init --recursive
mkdir build
cd build
cmake ..
cmake --build . --config Release
./bin/pcm-memory

Run pcm-memory in a second terminal while the PeerAccess test below executes; a flat host-memory bandwidth reading during the kernel indicates that the peer traffic bypasses host memory.

7. Testing PeerAccess

tee p2p.cu<<-'EOF'
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

template<int mode>
__global__ void dummyKernel(float *input_data,float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    output_data[idx]=input_data[idx];
}

template <typename F>
void TIMEIT(F const &f,cudaStream_t &stream,cudaEvent_t &start_ev,cudaEvent_t&stop_ev)
{ 
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    cudaEventRecord(start_ev, stream); 
    f(stream); 
    cudaEventRecord(stop_ev, stream); 
    CUDA_CHECK(cudaEventSynchronize(stop_ev)); 
    auto end = std::chrono::high_resolution_clock::now(); 
    std::chrono::duration<double> diff = end - start; 
    float milliseconds = 0; 
    cudaEventElapsedTime(&milliseconds, start_ev, stop_ev); 
    printf("E2E:%7.2fms Kernel:%7.2fms errno:%d\n",diff.count()*1000,milliseconds,cudaGetLastError());
}

int main() {
    int devID0 = 0, devID1 = 1;
    int device_count=0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    for(int deviceid=0; deviceid<2;deviceid++)
    {
        CUDA_CHECK(cudaSetDevice(deviceid));  
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
        std::cout << "-----------------------------------" << std::endl;
        std::cout << "Device Index: " << deviceid << std::endl;
        std::cout << "Compute Capability:"<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
        std::cout << "Device name: " << deviceProp.name << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
        std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;
        std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;
        std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;
        std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;
        std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;
    }
    
    std::cout << "-----------------------------------" << std::endl;
    int p2p_value=0;
    CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value,cudaDevP2PAttrAccessSupported,devID0,devID1));
    std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;
    
    #define block_size 1024L
    #define block_count ((512<<20)/block_size/4)  // 512 MiB total, still larger than the 256 MB BAR1 window
    
    size_t dataSize = block_count*block_size * sizeof(float);
    float *data0_dev, *data1_dev,*data1_dev_ex;

    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));

    CUDA_CHECK(cudaSetDevice(devID1));
    CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));
    CUDA_CHECK(cudaMalloc(&data1_dev_ex, dataSize));
    float *host;
    CUDA_CHECK(cudaMallocHost(&host,dataSize));

    printf("Init Done(%.2f)MB..\n",dataSize/1024.0/1024.0);
    // Enable P2P
    int canAccessPeer=0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));
    if (canAccessPeer) {
        CUDA_CHECK(cudaSetDevice(devID1));
        cudaStream_t stream;
        cudaStreamCreate(&stream);

        cudaEvent_t start_ev, stop_ev;
        cudaEventCreate(&start_ev);
        cudaEventCreate(&stop_ev);
        
        CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0)); // let devID1 access devID0's device memory
        do
        {
            //TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<1><<<block_count, block_size,0,stream>>>(host,data1_dev);},stream,start_ev,stop_ev);
            TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<2><<<block_count, block_size,0,stream>>>(data0_dev,data1_dev);},stream,start_ev,stop_ev);
        } while(0);
        CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));
    }
    else
    {
        printf("%s %d canAccessPeer=0\n",__FILE__,__LINE__);
    }

    CUDA_CHECK(cudaFreeHost(host));
    CUDA_CHECK(cudaFree(data0_dev));
    CUDA_CHECK(cudaFree(data1_dev));
    CUDA_CHECK(cudaFree(data1_dev_ex));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
export CUDA_VISIBLE_DEVICES=6,7
/usr/local/cuda/bin/ncu --metrics \
dram__bytes_read.sum,\
dram__bytes_write.sum,\
lts__t_sectors_srcunit_tex_aperture_sysmem.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_device.sum,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
lts__t_bytes.sum,\
smsp__sass_inst_executed_op_global_ld.sum ./p2p
/usr/local/cuda/bin/nsys profile --stats=true -o cuda_profiling_report_p2p.nsys-rep -f true -t cuda,nvtx --gpu-metrics-device=4,7 ./p2p
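Back-of-envelope expectations for the P2P run (my numbers, same assumptions as in section 5): dataSize is 512 MiB, so on the GPU executing the kernel (devID1) lts__t_sectors_srcunit_tex_aperture_peer.sum should be about 2^29 / 32 = 16,777,216 sectors, smsp__sass_inst_executed_op_global_ld.sum about 131072 x 1024 / 32 = 4,194,304 instructions, and pcie__read_bytes.sum on the order of 512 MiB.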

The experiment shows that PeerAccess traffic does not pass through host memory.
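As a further cross-check (an added sketch, not part of the original experiment), the same 512 MiB transfer can be pushed through the copy engine with cudaMemcpyPeerAsync. If pcm-memory still shows no host-memory traffic during this run, then the DMA path, like the SM load path above, also stays on the direct PCIe peer route:

tee peercopy.cu<<-'EOF'
#include <cstdio>
#include <cuda_runtime.h>

// Copy-engine peer copy between GPU0 and GPU1, timed with CUDA events.
int main() {
    const size_t dataSize = 512UL << 20;   // 512 MiB, larger than the 256 MB BAR1
    float *src = nullptr, *dst = nullptr;
    cudaSetDevice(0);
    cudaMalloc(&src, dataSize);            // source buffer on GPU0
    cudaSetDevice(1);
    cudaMalloc(&dst, dataSize);            // destination buffer on GPU1
    cudaDeviceEnablePeerAccess(0, 0);      // let GPU1 reach GPU0's memory

    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);
    cudaEventRecord(t0);
    cudaMemcpyPeerAsync(dst, 1, src, 0, dataSize);  // DMA instead of SM loads
    cudaEventRecord(t1);
    cudaEventSynchronize(t1);
    float ms = 0;
    cudaEventElapsedTime(&ms, t0, t1);
    printf("cudaMemcpyPeerAsync: %.2f ms, %.2f GB/s\n", ms, dataSize / ms / 1e6);

    cudaFree(dst);
    cudaSetDevice(0);
    cudaFree(src);
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o peercopy peercopy.cu
./peercopy

Whether the copy engine and the SM load path reach the same bandwidth is itself informative: the SM path pays read-request latency per warp, while the copy engine can keep many outstanding requests in flight.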

8. Reading GPU BAR1 Directly with devmem (Does the nsys "PCIe Read Requests to BAR1" Metric Only Count GPU-Driver Accesses?)

Conclusion: the metric also counts BAR1 accesses that originate outside the GPU driver.

lspci -s `nvidia-smi  -q | grep "Bus Id" | awk '{print $4}'` -v | grep "Memory at" | sed -n "2p"   # the second "Memory at" region is BAR1, the 256 MB prefetchable aperture
Memory at 383fd0000000 (64-bit, prefetchable) [size=256M]

tee devmem.c<<-'EOF'
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/types.h>

#define FATAL do { fprintf(stderr, "Error at line %d, file %s (%d) [%s]\n", \
  __LINE__, __FILE__, errno, strerror(errno)); exit(1); } while(0)

#define MAP_SIZE (32<<20)
#define MAP_MASK (MAP_SIZE - 1)


unsigned long GetTickCount()
{
  struct timeval tv;
  if( gettimeofday(&tv, NULL) != 0 )
    return 0;
  return (tv.tv_sec * 1000000) + (tv.tv_usec);
}

int main(int argc, char **argv) {
    int fd;
    void *map_base, *virt_addr;
    off_t target;

    if(argc < 4) { fprintf(stderr, "usage: %s <phys-addr-hex> <size> <byte-value>\n", argv[0]); exit(1); }
    target = strtoul(argv[1], 0, 16) & ~MAP_MASK;   /* align down to the 32 MB mapping granularity */
    unsigned int size=atoi(argv[2]);
    unsigned char value=atoi(argv[3]);

    if((fd = open("/dev/mem", O_RDWR | O_SYNC)) == -1) FATAL;

    map_base = mmap(0, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, target);
    if(map_base == (void *) -1) FATAL;

    virt_addr = map_base;

    unsigned char *wbuffer=(unsigned char*)malloc(size);
    for(int i=0;i<size;i++)
    {
      wbuffer[i]=value;
    }
    unsigned char *rbuffer=(unsigned char*)malloc(size);
    for(int i=0;i<size;i++)
    {
      rbuffer[i]=0x51;
    }
    unsigned long t0=GetTickCount();
    memcpy(virt_addr,wbuffer,size);            /* write through BAR1 */
    msync(virt_addr, size, MS_SYNC);
    __asm__ __volatile__("" ::: "memory");
    unsigned long t1=GetTickCount();
    printf("mem:%lx %ld(usec)\n",(unsigned long)target,t1-t0);

    memcpy(rbuffer,virt_addr,size);            /* read back through BAR1 */
    __asm__ __volatile__("" ::: "memory");     /* without this barrier the readback can be inconsistent */

    for(int i=0;i<size;i++)
    {
      if(wbuffer[i]!=rbuffer[i])
      {
        printf("mismatch at %d %02x %02x\n",i,wbuffer[i],rbuffer[i]);
        break;
      }
    }
    free(wbuffer);
    free(rbuffer);
    if(munmap(map_base, MAP_SIZE) == -1) FATAL;
    close(fd);
    return 0;
}

EOF
gcc -o devmem devmem.c  -std=c99 -g
/usr/local/cuda/bin/nsys profile --stats=true \
-o bar1_access_32.nsys-rep -f true -t cuda,nvtx --gpu-metrics-device=0 ./devmem 383fd0000000 32 32
/usr/local/cuda/bin/nsys profile --stats=true \
-o bar1_access_1024.nsys-rep -f true -t cuda,nvtx --gpu-metrics-device=0 ./devmem 383fd0000000 1024 32

Each run drives one user-space write and one read-back of the given size through BAR1. Since the GPU driver is idle here, any nonzero "PCIe Read Requests to BAR1" samples must come from the devmem accesses, which is what the conclusion above is based on.

