线程块
thread->CUDA程序的最小执行单位,一个CUDA程序由大量threads并行执行
block->多个threads组成
grid->多个block组成
warp->线程束,32个threads组成一个warp,是调度和运算的基本单元
准备工作
tx2+qt环境:https://blog.csdn.net/xx970829/article/details/112392488
tx2+cuda安装:https://blog.csdn.net/xx970829/article/details/112390718
源码
通过如下程序了解线程块的分配,用简短的内核程序来输出线程块、线程、线程束和线程全局标号到屏幕上:
kernel.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
extern "C" void what_is_my_id();
// Kernel: for each thread, record its block index, its thread index within
// the block, the warp it belongs to within the block, and its flattened
// global thread index.
// Expects a 1-D launch whose total thread count equals the length of the
// four output arrays (no bounds guard — caller must size arrays to match).
__global__ void my_id(
unsigned int *const block,
unsigned int *const thread,
unsigned int *const warp,
unsigned int *const calc_thread)
{
	// Flattened global index for a 1-D grid of 1-D blocks.
	const unsigned int thread_idx=(blockIdx.x*blockDim.x)+threadIdx.x;
	block[thread_idx]=blockIdx.x;
	thread[thread_idx]=threadIdx.x;
	// BUGFIX: the warp number is the in-block thread index divided by the
	// warp size (built-in warpSize, 32). The original stored threadIdx.x
	// again, so the printed "Warp" column never showed the actual warp.
	warp[thread_idx]=threadIdx.x/warpSize;
	calc_thread[thread_idx]=thread_idx;
}
// ----------------------------------------------------------------------------
// Host-side result buffers: one slot per GPU thread (2 blocks * 64 threads).
#define ARRAY_SIZE 128
// BUGFIX: compute the byte count from the element type actually used
// (unsigned int); the original wrote sizeof(int), which only works because
// the two types happen to have the same size.
#define ARRAY_SIZE_IN_BYTES (ARRAY_SIZE*sizeof(unsigned int))
unsigned int cpu_block[ARRAY_SIZE];
unsigned int cpu_thread[ARRAY_SIZE];
unsigned int cpu_warp[ARRAY_SIZE];
unsigned int cpu_calc_thread[ARRAY_SIZE];
// Abort with a readable message when a CUDA runtime call fails.
static void check_cuda(cudaError_t err,const char *what)
{
	if(err!=cudaSuccess)
	{
		fprintf(stderr,"CUDA error (%s): %s\n",what,cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}
// Host wrapper: launches my_id over 2 blocks of 64 threads (one thread per
// element of the 128-entry arrays), copies the results back to the cpu_*
// buffers and prints one line per thread.
// Has C linkage via the extern "C" declaration above / in kernel.h so the
// C++ translation unit (main.cpp) can call it.
void what_is_my_id()
{
	//2*64=128 threads in total, matching ARRAY_SIZE
	const unsigned int num_blocks=2;
	const unsigned int num_threads=64;
	//device-side result buffers
	unsigned int *gpu_block;
	unsigned int *gpu_thread;
	unsigned int *gpu_warp;
	unsigned int *gpu_calc_thread;
	//loop index
	unsigned int i;
	check_cuda(cudaMalloc((void **)&gpu_block,ARRAY_SIZE_IN_BYTES),"cudaMalloc gpu_block");
	check_cuda(cudaMalloc((void **)&gpu_thread,ARRAY_SIZE_IN_BYTES),"cudaMalloc gpu_thread");
	check_cuda(cudaMalloc((void **)&gpu_warp,ARRAY_SIZE_IN_BYTES),"cudaMalloc gpu_warp");
	check_cuda(cudaMalloc((void **)&gpu_calc_thread,ARRAY_SIZE_IN_BYTES),"cudaMalloc gpu_calc_thread");
	//kernel launch
	my_id<<<num_blocks,num_threads>>>(gpu_block,
	gpu_thread,
	gpu_warp,
	gpu_calc_thread);
	//a launch returns no status itself: pick up configuration errors here...
	check_cuda(cudaGetLastError(),"my_id launch");
	//...and execution errors at the next synchronizing call
	check_cuda(cudaDeviceSynchronize(),"my_id execution");
	//gpu->cpu
	check_cuda(cudaMemcpy(cpu_block,gpu_block,ARRAY_SIZE_IN_BYTES,cudaMemcpyDeviceToHost),"copy block");
	check_cuda(cudaMemcpy(cpu_thread,gpu_thread,ARRAY_SIZE_IN_BYTES,cudaMemcpyDeviceToHost),"copy thread");
	check_cuda(cudaMemcpy(cpu_warp,gpu_warp,ARRAY_SIZE_IN_BYTES,cudaMemcpyDeviceToHost),"copy warp");
	check_cuda(cudaMemcpy(cpu_calc_thread,gpu_calc_thread,ARRAY_SIZE_IN_BYTES,cudaMemcpyDeviceToHost),"copy calc_thread");
	//free device memory
	cudaFree(gpu_block);
	cudaFree(gpu_thread);
	cudaFree(gpu_warp);
	cudaFree(gpu_calc_thread);
	//print one line per thread
	for(i=0;i<ARRAY_SIZE;i++)
	{
		printf("Calculated Thread:%3u - Block:%2u - Warp:%2u - Thread:%3u\n",
		cpu_calc_thread[i],cpu_block[i],cpu_warp[i],cpu_thread[i]);
	}
}
kernel.h
#ifndef KERNEL_H
#define KERNEL_H
// Host entry point defined in kernel.cu. extern "C" gives it C linkage so
// the C++ translation unit (main.cpp) links against the nvcc-compiled
// definition without C++ name mangling.
extern "C" void what_is_my_id();
#endif // KERNEL_H
main.cpp
#include <iostream>
#include "kernel.h"
// Entry point: frame the CUDA thread-id demo with start/end banners.
int main()
{
    using std::cout;
    using std::endl;
    cout << "--- This is a demo:" << endl;
    what_is_my_id();
    cout << "--- Ending ... " << endl;
    return 0;
}
pro配置
#-------------------------------------------------
#
# Project created by QtCreator 2021-01
#
#-------------------------------------------------
QT += core gui
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets
TARGET = cuda_xianchengkuai
TEMPLATE = app
CONFIG += console c++11
# The following define makes your compiler emit warnings if you use
# any feature of Qt which has been marked as deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS
# You can also make your code fail to compile if you use deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
SOURCES += \
main.cpp
HEADERS += \
kernel.h
# .cu files are NOT listed in SOURCES; they are compiled by the custom
# nvcc rule registered via QMAKE_EXTRA_COMPILERS below.
CUDA_SOURCES +=./kernel.cu
#--cuda 9.0------------------------------------------------------------------------------------------------
OBJECTS_DIR = ./debug__
CUDA_OBJECTS_DIR = ./debug__
CUDA_SDK = "/usr/local/cuda-9.0/" # Path to cuda SDK install
CUDA_DIR = "/usr/local/cuda-9.0/" # Path to cuda toolkit install
# DO NOT EDIT BEYOND THIS UNLESS YOU KNOW WHAT YOU ARE DOING....
SYSTEM_NAME = ubuntu # Depending on your system either 'Win32', 'x64', or 'Win64'
SYSTEM_TYPE = 64 # '32' or '64', depending on your system
# sm_62 targets the Jetson TX2 (Pascal) GPU mentioned in the article.
CUDA_ARCH = sm_62 # Type of CUDA architecture,
# for example 'compute_10', 'compute_11', 'sm_10'
NVCC_OPTIONS = --use_fast_math
# include paths
INCLUDEPATH += $$CUDA_DIR/include
# library directories
QMAKE_LIBDIR += $$CUDA_DIR/lib64/
# Add the necessary libraries
CUDA_LIBS = -lcuda -lcudart
# The following makes sure all path names (which often include spaces)
# are put between quotation marks
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
LIBS += $$CUDA_LIBS
# Configuration of the Cuda compiler
# NOTE(review): $$NVCC_LIBS in the nvcc commands below is never defined
# (only CUDA_LIBS is), so it expands to nothing. That happens to be
# harmless for a compile-only (-c) step, but confirm whether CUDA_LIBS
# was intended.
CONFIG(debug, debug|release) {
# Debug mode
cuda.input = CUDA_SOURCES
cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
cuda.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS \
$$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE \
-arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
cuda.dependency_type = TYPE_C
QMAKE_EXTRA_COMPILERS += cuda
}
else {
# Release mode
cuda.input = CUDA_SOURCES
cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS \
$$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE \
-arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
cuda.dependency_type = TYPE_C
QMAKE_EXTRA_COMPILERS += cuda
}
结果分析
分别尝试三种启动配置(总线程数均为128),观察输出的变化:
1个线程块,每个线程块128个线程
2个线程块,每个线程块64个线程(即上面源码中的配置)
4个线程块,每个线程块32个线程
参考:《CUDA并行程序设计,gpu编程指南》