Exploring how the reach of NVIDIA GPU PeerAccess goes beyond the PCIe BAR size
1. Related links
- IOMMU VFIO GROUP
- DMAR table + IOMMU
- What is an IOMMU
- Shared Virtual Addressing for high performance Arm Infrastructure platforms
- Pcie Read Request to BAR1
2. Observations
- When GPU0 reads data from GPU1 via PeerAccess, GPU0's PCIe link shows about 21% TX utilization while GPU1's PCIe link simultaneously shows about 21% RX, as if GPU0 were configuring GPU1 over PCIe during the transfer.
- Neither GPU's BAR1 appears to be touched while the kernel runs, and NV does not expose BAR1 metrics; it is also possible that GPU0 is programming GPU1's BAR1 window mapping through GPU1's BAR0.
- If GPU0 instead raised MMU page faults during the kernel and the host driver set up the BAR1 mappings through BAR0, then GPU0's PCIe link should show no TX utilization at all.
- The next step is to capture the traffic with a logic analyzer for further analysis; a small sketch for reading each GPU's BAR1 size follows this list.
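The BAR1 window that PeerAccess would otherwise be limited to can be read out per GPU: nvidia-smi -q prints a "BAR1 Memory Usage" section, and the PCIe Rx/Tx counters behind the 21% figures above can be sampled with nvidia-smi dmon -s t. Below is a minimal NVML sketch for dumping the BAR1 size of every GPU (the file name and structure are my own; it only assumes nvml.h from the CUDA toolkit and the driver's libnvidia-ml):
tee bar1info.c<<-'EOF'
#include <stdio.h>
#include <nvml.h>
/* Print the BAR1 aperture size and current usage of every GPU via NVML. */
int main() {
    if (nvmlInit_v2() != NVML_SUCCESS) { fprintf(stderr, "nvmlInit failed\n"); return 1; }
    unsigned int count = 0;
    nvmlDeviceGetCount_v2(&count);
    for (unsigned int i = 0; i < count; i++) {
        nvmlDevice_t dev;
        nvmlBAR1Memory_t bar1;
        if (nvmlDeviceGetHandleByIndex_v2(i, &dev) != NVML_SUCCESS) continue;
        if (nvmlDeviceGetBAR1MemoryInfo(dev, &bar1) == NVML_SUCCESS)
            printf("GPU%u BAR1 total=%llu MiB used=%llu MiB\n", i,
                   (unsigned long long)(bar1.bar1Total >> 20),
                   (unsigned long long)(bar1.bar1Used >> 20));
    }
    nvmlShutdown();
    return 0;
}
EOF
gcc -o bar1info bar1info.c -I /usr/local/cuda/include -lnvidia-ml
./bar1info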




3. Upgrade to cuda_12.6.2 [optional]
wget https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
sudo apt-get --purge -y remove 'nvidia*'
sh cuda_12.6.2_560.35.03_linux.run
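After the installer finishes, the toolkit and driver versions can be confirmed with:
/usr/local/cuda/bin/nvcc --version
nvidia-smi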
4. Install open-gpu-kernel-modules [optional, only needed to debug the NV driver source]
git clone --branch 560.35.03 --single-branch https://github.com/NVIDIA/open-gpu-kernel-modules.git
cd open-gpu-kernel-modules
git branch   # the clone has already checked out the 560.35.03 branch
make modules -j$(nproc)   # build the out-of-tree modules before loading them
rmmod nvidia-uvm
rmmod nvidia-drm
rmmod nvidia-modeset
rmmod nvidia
dmesg -C
insmod kernel-open/nvidia.ko
insmod kernel-open/nvidia-modeset.ko
insmod kernel-open/nvidia-drm.ko
insmod kernel-open/nvidia-uvm.ko
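If the open modules loaded correctly they will show up in lsmod, and dmesg should contain fresh NVRM messages:
lsmod | grep nvidia
dmesg | grep -i nvrm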
5. Test accessing host memory from a kernel, and H2D copies
tee p2p.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <chrono>
#include <thread>
#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
template<int mode>
__global__ void copyKernel(float *input_data, float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    output_data[idx] = input_data[idx];
}
template<int mode>
__global__ void incKernel(float *input_data, float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    input_data[idx] += 1;
}
template <typename F>
void TIMEIT(F const &f, cudaStream_t &stream, cudaEvent_t &start_ev, cudaEvent_t &stop_ev)
{
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    cudaEventRecord(start_ev, stream);
    f(stream);
    cudaEventRecord(stop_ev, stream);
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start_ev, stop_ev);
    printf("E2E:%7.2fms Kernel:%7.2fms\n", diff.count()*1000, milliseconds);
}
int main() {
    int devID0 = 0;
#define block_size 1024L
#define block_count ((1024<<20)/block_size/4) // 1 GiB in total, larger than the BAR1 window
    size_t dataSize = block_count * block_size * sizeof(float);
    float *data0;
    float *data1;
    CUDA_CHECK(cudaSetDevice(devID0));
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    CUDA_CHECK(cudaMallocHost(&data0, dataSize));
    CUDA_CHECK(cudaMalloc(&data1, dataSize));
    cudaEvent_t start_ev, stop_ev;
    cudaEventCreate(&start_ev);
    cudaEventCreate(&stop_ev);
    TIMEIT([&](cudaStream_t &stream)-> void {cudaMemcpyAsync(data1, data0, dataSize, cudaMemcpyHostToDevice, stream);}, stream, start_ev, stop_ev);
    TIMEIT([&](cudaStream_t &stream)-> void {copyKernel<1><<<block_count, block_size, 0, stream>>>(data0, data1);}, stream, start_ev, stop_ev);
    TIMEIT([&](cudaStream_t &stream)-> void {incKernel<1><<<block_count, block_size, 0, stream>>>(data0, data1);}, stream, start_ev, stop_ev);
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
/usr/local/cuda/bin/nsys profile --stats=true -o cuda_profing_report.nsys-rep -f true -t cuda,nvtx --gpu-metrics-frequency=100 --gpu-metrics-devices=0 ./p2p
/usr/local/cuda/bin/ncu --metrics \
dram__bytes_read.sum,\
dram__bytes_write.sum,\
lts__t_sectors_srcunit_tex_aperture_sysmem.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_device.sum,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
lts__t_bytes.sum,\
smsp__sass_inst_executed_op_global_ld.sum ./p2p
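The three aperture counters are what distinguish the paths in these experiments: lts__t_sectors_srcunit_tex_aperture_sysmem.sum, ..._aperture_peer.sum and ..._aperture_device.sum count the L2 sectors requested by the SMs according to the aperture they target, i.e. host (system) memory, a peer GPU's memory, or local device memory.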

The L2 metrics show that the requests all went to the peer aperture.

6. Set up pcm to monitor host-memory bandwidth, to determine whether PeerAccess goes through host memory
git clone --recursive https://github.com/intel/pcm
cd pcm
git submodule update --init --recursive
mkdir build
cd build
cmake ..
cmake --build .
cmake --build . --config Release
./bin/pcm-memory
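pcm-memory needs root so it can program the uncore counters. While the PeerAccess test in the next section runs, watch the read/write bandwidth it reports about once per second; if it stays at the idle baseline during the peer-to-peer kernel, the traffic is not being staged through host memory.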
7. Test PeerAccess
tee p2p.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <chrono>
#include <thread>
#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
template<int mode>
__global__ void dummyKernel(float *input_data, float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    output_data[idx] = input_data[idx];
}
template <typename F>
void TIMEIT(F const &f, cudaStream_t &stream, cudaEvent_t &start_ev, cudaEvent_t &stop_ev)
{
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    cudaEventRecord(start_ev, stream);
    f(stream);
    cudaEventRecord(stop_ev, stream);
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start_ev, stop_ev);
    printf("E2E:%7.2fms Kernel:%7.2fms errno:%d\n", diff.count()*1000, milliseconds, cudaGetLastError());
}
int main() {
    int devID0 = 0, devID1 = 1;
    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    for(int deviceid = 0; deviceid < 2; deviceid++)
    {
        CUDA_CHECK(cudaSetDevice(deviceid));
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
        std::cout << "-----------------------------------" << std::endl;
        std::cout << "Device Index: " << deviceid << std::endl;
        std::cout << "Compute Capability:" << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Device name: " << deviceProp.name << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
        std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;
        std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;
        std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;
        std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;
        std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;
    }
    std::cout << "-----------------------------------" << std::endl;
    int p2p_value = 0;
    CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value, cudaDevP2PAttrAccessSupported, devID0, devID1));
    std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;
#define block_size 1024L
#define block_count ((512<<20)/block_size/4)
    size_t dataSize = block_count * block_size * sizeof(float);
    float *data0_dev, *data1_dev, *data1_dev_ex;
    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));
    CUDA_CHECK(cudaSetDevice(devID1));
    CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));
    CUDA_CHECK(cudaMalloc(&data1_dev_ex, dataSize));
    float *host;
    CUDA_CHECK(cudaMallocHost(&host, dataSize));
    printf("Init Done(%.2f)MB..\n", dataSize/1024.0/1024.0);
    // Enable P2P
    int canAccessPeer = 0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));
    if (canAccessPeer) {
        CUDA_CHECK(cudaSetDevice(devID1));
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaEvent_t start_ev, stop_ev;
        cudaEventCreate(&start_ev);
        cudaEventCreate(&stop_ev);
        CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0)); // let devID1 access devID0's device memory
        do
        {
            //TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<1><<<block_count, block_size, 0, stream>>>(host, data1_dev);}, stream, start_ev, stop_ev);
            TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<2><<<block_count, block_size, 0, stream>>>(data0_dev, data1_dev);}, stream, start_ev, stop_ev);
        } while(0);
        CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));
    }
    else
    {
        printf("%s %d canAccessPeer=0\n", __FILE__, __LINE__);
    }
    CUDA_CHECK(cudaFreeHost(host));
    CUDA_CHECK(cudaFree(data0_dev));
    CUDA_CHECK(cudaFree(data1_dev));
    CUDA_CHECK(cudaFree(data1_dev_ex));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
export CUDA_VISIBLE_DEVICES=6,7
/usr/local/cuda/bin/ncu --metrics \
dram__bytes_read.sum,\
dram__bytes_write.sum,\
lts__t_sectors_srcunit_tex_aperture_sysmem.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_device.sum,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
lts__t_bytes.sum,\
smsp__sass_inst_executed_op_global_ld.sum ./p2p
/usr/local/cuda/bin/nsys profile --stats=true -o cuda_profing_report_p2p.nsys-rep -f true -t cuda,nvtx --gpu-metrics-device=4,7 ./p2p



The experiment shows that PeerAccess does not go through host memory.
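For comparison with the kernel-driven peer reads timed above, the copy-engine path can be measured the same way. A minimal sketch, reusing data0_dev/data1_dev, dataSize, the stream/events and the TIMEIT helper from the listing above (all identifiers are carried over from that code, not new API):
TIMEIT([&](cudaStream_t &stream)-> void {
    // explicit peer copy: pull dataSize bytes from devID0's buffer into devID1's buffer
    cudaMemcpyPeerAsync(data1_dev, devID1, data0_dev, devID0, dataSize, stream);
}, stream, start_ev, stop_ev);
With peer access enabled, this copy should also go GPU-to-GPU over PCIe, so pcm-memory should again stay near idle.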
8. Read GPU BAR1 directly with devmem (to test whether the "Pcie Read Request to BAR1" metric in nsys only records the GPU driver's reads/writes of BAR1)
Conclusion: the metric also counts BAR1 accesses that do not come from the GPU driver.
lspci -s `nvidia-smi -q | grep "Bus Id" | awk '{print $4}'` -v | grep "Memory at" | sed -n "2,1p"
Memory at 383fd0000000 (64-bit, prefetchable) [size=256M]
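On NVIDIA GPUs, BAR0 is the (typically 16 MB, non-prefetchable) register window and BAR1 is the large prefetchable aperture, which is why the second "Memory at" line is selected: its base address (383fd0000000 here, 256 MB) is the argument passed to devmem below.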
tee devmem.c<<-'EOF'
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <fcntl.h>
#include <ctype.h>
#include <sys/time.h>
#include <time.h>
#include <termios.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <semaphore.h>
#include <stdint.h>
#include <pthread.h>
#define FATAL do { fprintf(stderr, "Error at line %d, file %s (%d) [%s]\n", \
    __LINE__, __FILE__, errno, strerror(errno)); exit(1); } while(0)
#define MAP_SIZE (32<<20)
#define MAP_MASK (MAP_SIZE - 1)
/* returns the current time in microseconds, despite the name */
unsigned long GetTickCount()
{
    struct timeval tv;
    if( gettimeofday(&tv, NULL) != 0 )
        return 0;
    return (tv.tv_sec * 1000000) + (tv.tv_usec);
}
#define ALIGN_UP(x, a) ( ( ((x) + ((a) - 1) ) / a ) * a )
int main(int argc, char **argv) {
    int fd;
    void *map_base, *virt_addr;
    off_t target;
    target = strtoul(argv[1], 0, 16) & ~MAP_MASK;
    unsigned int size = atoi(argv[2]);
    unsigned char value = atoi(argv[3]);
    if((fd = open("/dev/mem", O_RDWR | O_SYNC, S_IRWXU)) == -1) FATAL;
    fflush(stdout);
    map_base = mmap(0, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, target);
    if(map_base == (void *) -1) FATAL;
    fflush(stdout);
    virt_addr = map_base;
    unsigned char *wbuffer = (unsigned char*)malloc(size);
    for(int i = 0; i < size; i++)
    {
        wbuffer[i] = value;
    }
    unsigned char *rbuffer = (unsigned char*)malloc(size);
    for(int i = 0; i < size; i++)
    {
        rbuffer[i] = 0x51;
    }
    unsigned long t0 = GetTickCount();
    memcpy(virt_addr, wbuffer, size);
    msync(virt_addr, size, MS_SYNC);
    __asm__ __volatile__("" ::: "memory");
    unsigned long t1 = GetTickCount();
    printf("mem:%lx %lu(usec)\n", (unsigned long)target, t1 - t0);
    memcpy(rbuffer, virt_addr, size);
    msync(rbuffer, size, MS_SYNC);
    __asm__ __volatile__("" ::: "memory"); // without this barrier the read-back comparison can be inconsistent
    for(int i = 0; i < size; i++)
    {
        if(wbuffer[i] != rbuffer[i])
        {
            printf("mismatch at %d %02x %02x\n", i, wbuffer[i], rbuffer[i]);
            break;
        }
    }
    if(munmap(map_base, MAP_SIZE) == -1) FATAL;
    close(fd);
    return 0;
}
EOF
gcc -o devmem devmem.c -std=c99 -g
/usr/local/cuda/bin/nsys profile --stats=true \
-o bar1_access.nsys-rep -f true -t cuda,nvtx --gpu-metrics-device=0 ./devmem 383fd0000000 32 32
/usr/local/cuda/bin/nsys profile --stats=true \
-o bar1_access.nsys-rep -f true -t cuda,nvtx --gpu-metrics-device=0 ./devmem 383fd0000000 1024 32
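Note: devmem must run as root, and on kernels built with CONFIG_IO_STRICT_DEVMEM the mmap of a BAR already claimed by the nvidia driver through /dev/mem may fail unless the kernel is booted with iomem=relaxed; whether this applies depends on the test machine's kernel configuration.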

