环境:显卡驱动支持的 CUDA 版本为 12.2,CUDA 运行时版本为 12.0,安装的 PyTorch 对应 CUDA 12.1。
pycuda安装命令:
conda install -c conda-forge pycuda
pycuda测试:
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
# CUDA C source of the kernel: every thread computes one global index and,
# if that index is inside the vector, writes a[idx] + b[idx] into c[idx].
KERNEL_SOURCE = """
__global__ void add_vectors(float *a, float *b, float *c, int n)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < n)
{
c[idx] = a[idx] + b[idx];
}
}
"""
# NOTE(review): building the module shells out to `nvcc` with an `-arch`
# value (the error log at the end of this file shows
# `nvcc --cubin -arch sm_89`), so the nvcc that gets picked up must recognise
# that architecture -- TODO confirm which toolkit version is on PATH.
mod = SourceModule(KERNEL_SOURCE)

# Number of elements in each vector.
n = 10000

# Two random float32 input vectors (a is drawn first, then b) and a
# zero-filled float32 output vector of the same length.
a, b = (numpy.random.randn(n).astype(numpy.float32) for _ in range(2))
c = numpy.zeros(n, dtype=numpy.float32)

# Allocate one device buffer per host array, then upload only the inputs;
# the output buffer is filled by the kernel.
a_gpu, b_gpu, c_gpu = (drv.mem_alloc(host.nbytes) for host in (a, b, c))
for device_buf, host_buf in ((a_gpu, a), (b_gpu, b)):
    drv.memcpy_htod(device_buf, host_buf)

# Launch geometry: fixed block width, and enough blocks to cover all n
# elements (ceiling division written as a negated floor division).
blocksize = 256
gridsize = -(-n // blocksize)
launch_config = {"block": (blocksize, 1, 1), "grid": (gridsize, 1)}

# Look up the compiled kernel and run it.
add_vectors = mod.get_function("add_vectors")
add_vectors(a_gpu, b_gpu, c_gpu, numpy.int32(n), **launch_config)

# Download the result from the device into the host output array.
drv.memcpy_dtoh(c, c_gpu)

# Compare the GPU result against the same sum computed with numpy.
expected = a + b
assert numpy.allclose(c, expected), "result not correct"

# Show the inputs and the result.
for label, vector in (("a:", a), ("b:", b), ("c:", c)):
    print(label, vector)
运行上述测试脚本时报错(nvcc 不识别 sm_89 架构):
pycuda.driver.CompileError: nvcc compilation of /tmp/tmpjtjjpnp3/kernel.cu failed
[command: nvcc --cubin -arch sm_89 -I/home/ps/anaconda3/envs/pytorch-pu/lib/python3.10/site-packages/pycuda/cuda kernel.cu.]
[stderr:
nvcc fatal: Value 'sm_89' is not defined for option 'gpu-architecture'
]