动态并行(Dynamic Parallelism)允许在GPU端(即设备代码中)直接启动核函数,它要求显卡的计算能力不低于3.5(即 sm_35 及以上)。
使用动态并行时,Qt Creator 的工程配置(.pro 文件)与不使用动态并行时有较大差别。下面以 Linux 系统为例,给出完整的 .pro 文件内容:
# ---------------------------------------------------------------------------
# Base Qt project settings.
# ---------------------------------------------------------------------------
QT += core
# Qt 5 and later moved the widget classes into a separate module.
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets

TARGET = test
TEMPLATE = app

DEFINES += QT_DEPRECATED_WARNINGS

# CUDA toolkit headers, so host code can include the CUDA runtime headers.
INCLUDEPATH += /usr/local/cuda-10.1/include

SOURCES += main.cpp
# Was misspelled "HEADERAS", which qmake treats as an unrelated user variable,
# so algorithm.h was never registered as a project header.
HEADERS += algorithm.h

# CUDA libraries for the final host link. cudadevrt is the device runtime
# library that dynamic parallelism requires.
LIBS += -L/usr/local/cuda-10.1/lib64 \
        -lcublas -lcuda -lcudadevrt \
        -lcudart -lcudart_static -lcufft \
        -lcufftw -lcurand -lcusolver -lcusparse
# ---------------------------------------------------------------------------
# CUDA sources and toolkit settings.
# ---------------------------------------------------------------------------
# List the .cu file in OTHER_FILES so it appears in the Qt Creator project
# tree; CUDA_SOURCES is the input list of the custom nvcc compile step below.
OTHER_FILES += ./algorithm.cu
CUDA_SOURCES += ./algorithm.cu

CUDA_SDK = "/usr/local/cuda-10.1"
CUDA_DIR = "/usr/local/cuda-10.1"
QMAKE_LIBDIR += $$CUDA_DIR/lib64

SYSTEM_TYPE = 64              # passed to nvcc as --machine
CUDA_ARCH = sm_60             # must be sm_35 or newer for dynamic parallelism
NVCCFLAGS = --use_fast_math

# Turn every INCLUDEPATH entry into a quoted -I"..." option for nvcc.
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
CUDA_LIBS = -L/usr/local/cuda-10.1/lib64 \
            -lcublas -lcuda -lcudadevrt \
            -lcudart -lcudart_static -lcufft \
            -lcufftw -lcurand -lcusolver -lcusparse
CUDA_OBJECTS_DIR = ./

# ---------------------------------------------------------------------------
# Key point: to support dynamic parallelism, the build uses a two-step
# process of separate compilation followed by device linking.
#
# Step 1 (this compiler): compile each .cu file into an object containing
# relocatable device code (-rdc=true).
# ---------------------------------------------------------------------------
cudaIntr.input = CUDA_SOURCES
cudaIntr.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}.o
# Uses $$NVCCFLAGS: the original referenced $$NVCC_OPTIONS, which is never
# defined, so the flags declared above were silently dropped.
cudaIntr.commands = $$CUDA_DIR/bin/nvcc $$NVCCFLAGS $$CUDA_INC $$CUDA_LIBS --machine $$SYSTEM_TYPE \
                    -arch=$$CUDA_ARCH -std=c++11 -rdc=true -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
cudaIntr.dependency_type = TYPE_C
# Publish the objects twice: CUDA_OBJ feeds the device-link step, and OBJECTS
# adds them to the final host link.
cudaIntr.variable_out = CUDA_OBJ
cudaIntr.variable_out += OBJECTS
# Clean the directory the objects are actually written to (the original
# pointed at "cudaIntrObj/", which this project never creates).
cudaIntr.clean = $$CUDA_OBJECTS_DIR/*.o
QMAKE_EXTRA_COMPILERS += cudaIntr
# ---------------------------------------------------------------------------
# Step 2: device-link the relocatable-device-code objects collected in
# CUDA_OBJ (nvcc -dlink). The resulting *_link.o is an ordinary object file
# that the host linker combines with the rest of the program.
# ---------------------------------------------------------------------------
cuda.input = CUDA_OBJ
cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_link.o
cuda.commands = $$CUDA_DIR/bin/nvcc -arch=$$CUDA_ARCH -std=c++11 -dlink ${QMAKE_FILE_NAME} -o ${QMAKE_FILE_OUT}
cuda.dependency_type = TYPE_C
# NOTE(review): the input of this step is an object file, so running
# "nvcc -M" on it probably yields no useful dependency list - confirm whether
# this line is needed at all.
cuda.depend_command = $$CUDA_DIR/bin/nvcc -g -M $$CUDA_INC $$NVCCFLAGS ${QMAKE_FILE_NAME}
QMAKE_EXTRA_COMPILERS += cuda

上述配置的具体原理我目前也不完全清楚,它是根据网上的许多资料搜集整理而成的,经自己测试可以正常运行,以后弄懂了再补充。