编译(Ampere 架构显卡,如 RTX 3060、3070 等):
nvcc -arch=sm_86 -o hello hello_ptx.cu
或写成Makefile:
hello: hello_ptx.cu
	nvcc -arch=sm_86 -o $@ $^
#nvcc -arch=sm_86 -o hello hello_ptx.cu
$@ 是指目标
$^ 是指所有依赖(去除重复项);第一个依赖用 $< 表示
hello_ptx.cu
#include <stdio.h>
#include <stdint.h>
/*
 * Returns the value of the PTX special register %smid as seen by the
 * calling thread, i.e. the identifier of the SM it is currently running on.
 * The read is done with inline PTX; `volatile` keeps the compiler from
 * caching or hoisting the register read.
 */
static __device__ __inline__ uint32_t __mysmid()
{
    uint32_t sm_index;
    asm volatile("mov.u32 %0, %%smid;" : "=r"(sm_index));
    return sm_index;
}
/*
 * Returns the value of the PTX special register %warpid for the calling
 * thread. The read is done with inline PTX; `volatile` keeps the compiler
 * from caching or hoisting the register read.
 */
static __device__ __inline__ uint32_t __mywarpid()
{
    uint32_t warp_index;
    asm volatile("mov.u32 %0, %%warpid;" : "=r"(warp_index));
    return warp_index;
}
/*
 * Returns the value of the PTX special register %laneid for the calling
 * thread, i.e. its lane position inside its warp. The read is done with
 * inline PTX; `volatile` keeps the compiler from caching or hoisting the
 * register read.
 */
static __device__ __inline__ uint32_t __mylaneid()
{
    uint32_t lane_index;
    asm volatile("mov.u32 %0, %%laneid;" : "=r"(lane_index));
    return lane_index;
}
/*
 * Debug kernel that reports where blocks are placed on the GPU.
 *
 * Thread 0 of every block whose blockIdx.x is in [0, 33] prints one line
 * containing its global 1-D thread index, the SM it runs on, its warp id
 * and its lane id. All other threads do nothing.
 *
 * Expects a 1-D launch (only the .x components of threadIdx, blockIdx and
 * blockDim are used). Uses device-side printf, so it is for inspection
 * only, not for performance-sensitive code.
 */
__global__ void mykernel()
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const unsigned thx = threadIdx.x;

    // Alternative thread filters kept from earlier experiments:
    // if(threadIdx.x==1023)// && blockIdx.x<3)
    // if(threadIdx.x==1)
    // if((thx==0 || thx==32 || thx==64 || thx==96 || thx==128 || thx==160 || thx==192 || thx==224 || thx==256) && blockIdx.x==0)
    if (blockIdx.x <= 33 && thx == 0) {
        // The three id helpers return uint32_t, so they are printed with %u
        // (the original %d was a signed/unsigned format mismatch).
        printf("I am thread %d, my SM ID is %u, my warp ID is %u, and my warp lane is %u\n",
               idx, __mysmid(), __mywarpid(), __mylaneid());
    }
}
/*
 * Launches mykernel on a 1-D grid of 34 blocks x 1024 threads and waits for
 * it to finish so that the device-side printf output is flushed.
 *
 * Returns 0 on success and 1 if the launch or the kernel execution failed;
 * the CUDA error string is written to stderr in that case.
 */
int main()
{
    dim3 grid_;
    dim3 block_;
    grid_.x = 34;     // matches the kernel's `blockIdx.x <= 33` filter
    block_.x = 1024;

    mykernel<<<grid_, block_>>>();

    // A kernel launch returns nothing; configuration errors (for example a
    // binary built for the wrong -arch) are only visible via cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Synchronizing both waits for the kernel and surfaces errors raised
    // while it was running.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}
// Build: nvcc -arch=sm_86 -o hello hello_ptx.cu   (the older -arch=sm_20 target is no longer accepted by current nvcc releases)
运行:
结果分析:
在 RTX 3060 上,
当 block 的个数从 1 个逐步增加到 2 个、3 个……时,
各 block 的 smid 依次为 0, 2, 4, …,直到最大的偶数 max_even(smid);之后才是 1, 3, 5, …,直到最大的奇数 max_odd(smid)。
RTX 3060 的每个 TPC 包含 2 个 SM(2 SM/TPC)。
smid 先偶后奇的变化,与这里的 2 是否有关系?如果有,是什么关系?