tx2+qt+cuda:线程块与线程学习(含源码)

7 篇文章 2 订阅
6 篇文章 0 订阅
该博客介绍了CUDA编程中线程块、线程、线程束和线程全局标号的概念,并提供了一个简单的CUDA内核程序示例来输出这些信息。通过配置不同的线程块和线程数,展示了CUDA程序如何组织和调度计算任务。此外,还给出了Qt环境下CUDA的安装步骤和项目配置,以及源码的详细解析。
摘要由CSDN通过智能技术生成

线程块

thread->一个cuda程序会被多个threads执行
block->多个threads组成
grid->多个block组成
warp->线程束,32个threads组成一个warp,是调度和运算的基本单元

准备工作
tx2+qt环境:https://blog.csdn.net/xx970829/article/details/112392488
tx2+cuda安装:https://blog.csdn.net/xx970829/article/details/112390718

源码

通过如下程序了解线程块的分配,用简短的内核程序来输出线程块、线程、线程束和线程全局标号到屏幕上:
kernel.cu

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

extern "C" void what_is_my_id();

// Kernel: each thread records its own block index, thread index, warp
// number, and flat global thread index into the output arrays.
// Expects a 1-D grid of 1-D blocks; every output array must have at least
// gridDim.x * blockDim.x elements (the caller launches exactly that many
// threads, so no bounds guard is needed here).
__global__ void my_id(
        unsigned int *const block,
        unsigned int *const thread,
        unsigned int *const warp,
        unsigned int *const calc_thread)
{
    // Flat global thread id for a 1-D launch.
    const unsigned int thread_idx=(blockIdx.x*blockDim.x)+threadIdx.x;
    block[thread_idx]=blockIdx.x;
    thread[thread_idx]=threadIdx.x;
    // FIX: the warp number is threadIdx.x / warpSize (warpSize == 32),
    // not threadIdx.x itself — the original stored the thread index twice,
    // so the "Warp" column in the printed output was wrong.
    warp[thread_idx]=threadIdx.x/warpSize;
    calc_thread[thread_idx]=thread_idx;
}

// ----------------------------------------------------------------------------
#define ARRAY_SIZE 128
// Element type of the host/device buffers is unsigned int, so size the
// byte count from that type (the original used sizeof(int), which only
// works because int and unsigned int happen to be the same size).
#define ARRAY_SIZE_IN_BYTES (ARRAY_SIZE*sizeof(unsigned int))

// Minimal error-check wrapper: CUDA API calls fail silently if their
// return codes are ignored, and a sticky error makes every later call
// fail mysteriously.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Host-side result buffers, one slot per launched thread.
unsigned int cpu_block[ARRAY_SIZE];
unsigned int cpu_thread[ARRAY_SIZE];
unsigned int cpu_warp[ARRAY_SIZE];
unsigned int cpu_calc_thread[ARRAY_SIZE];

// Launches my_id with 2 blocks x 64 threads (= ARRAY_SIZE threads total),
// copies the per-thread id arrays back to the host and prints one line
// per thread.
void what_is_my_id()
{
     // 2 * 64 = 128 == ARRAY_SIZE: exactly one output slot per thread.
     const unsigned int num_blocks=2;
     const unsigned int num_threads=64;

     // Device-side output buffers.
     unsigned int *gpu_block;
     unsigned int *gpu_thread;
     unsigned int *gpu_warp;
     unsigned int *gpu_calc_thread;

     unsigned int i;
     CUDA_CHECK(cudaMalloc((void **)&gpu_block,ARRAY_SIZE_IN_BYTES));
     CUDA_CHECK(cudaMalloc((void **)&gpu_thread,ARRAY_SIZE_IN_BYTES));
     CUDA_CHECK(cudaMalloc((void **)&gpu_warp,ARRAY_SIZE_IN_BYTES));
     CUDA_CHECK(cudaMalloc((void **)&gpu_calc_thread,ARRAY_SIZE_IN_BYTES));

     // Kernel launch. Launch-configuration errors do not surface from the
     // <<<>>> expression itself — query them explicitly afterwards.
     my_id<<<num_blocks,num_threads>>>(gpu_block,
                                       gpu_thread,
                                       gpu_warp,
                                       gpu_calc_thread);
     CUDA_CHECK(cudaGetLastError());

     // Blocking device->host copies; these also synchronize with the
     // kernel, so no explicit cudaDeviceSynchronize() is needed.
     CUDA_CHECK(cudaMemcpy(cpu_block,gpu_block,ARRAY_SIZE_IN_BYTES,cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(cpu_thread,gpu_thread,ARRAY_SIZE_IN_BYTES,cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(cpu_warp,gpu_warp,ARRAY_SIZE_IN_BYTES,cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(cpu_calc_thread,gpu_calc_thread,ARRAY_SIZE_IN_BYTES,cudaMemcpyDeviceToHost));

     // Release device memory.
     CUDA_CHECK(cudaFree(gpu_block));
     CUDA_CHECK(cudaFree(gpu_thread));
     CUDA_CHECK(cudaFree(gpu_warp));
     CUDA_CHECK(cudaFree(gpu_calc_thread));

     // Print one row per thread: global id, block, warp, local thread id.
     for(i=0;i<ARRAY_SIZE;i++)
     {
         printf("Calculated Thread:%3u - Block:%2u - Warp:%2u - Thread:%3u\n",
                cpu_calc_thread[i],cpu_block[i],cpu_warp[i],cpu_thread[i]);
     }
}

kernel.h

#ifndef KERNEL_H
#define KERNEL_H

// Guard extern "C" behind __cplusplus so this header also compiles when
// included from a plain C translation unit (a bare extern "C" is a C++
// construct and is a syntax error in C).
#ifdef __cplusplus
extern "C" {
#endif

// Runs the thread/block/warp id demo: launches the my_id kernel and
// prints one line per thread to stdout. Defined in kernel.cu.
void what_is_my_id();

#ifdef __cplusplus
}
#endif

#endif // KERNEL_H

main.cpp

#include <iostream>
#include "kernel.h"

// Entry point: frame the CUDA thread/block/warp id demo with
// begin/end markers on stdout.
int main()
{
    std::cout << "--- This is a demo:" << std::endl;

    // All CUDA work (allocation, launch, copy-back, printing) happens here.
    what_is_my_id();

    std::cout << "--- Ending ... " << std::endl;
    return 0;
}

pro配置

#-------------------------------------------------
#
# Project created by QtCreator 2021-01 
#
#-------------------------------------------------
QT      += core gui

greaterThan(QT_MAJOR_VERSION, 4): QT += widgets

TARGET = cuda_xianchengkuai

TEMPLATE = app
CONFIG  += console c++11
 

# The following define makes your compiler emit warnings if you use
# any feature of Qt which has been marked as deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS

# You can also make your code fail to compile if you use deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0


SOURCES += \
        main.cpp

HEADERS += \
    kernel.h


CUDA_SOURCES += ./kernel.cu
#--cuda 9.0------------------------------------------------------------------------------------------------
OBJECTS_DIR = ./debug__
CUDA_OBJECTS_DIR = ./debug__

CUDA_SDK = "/usr/local/cuda-9.0/"   # Path to cuda SDK install
CUDA_DIR = "/usr/local/cuda-9.0/"   # Path to cuda toolkit install

# DO NOT EDIT BEYOND THIS UNLESS YOU KNOW WHAT YOU ARE DOING....
SYSTEM_NAME = ubuntu        # Depending on your system either 'Win32', 'x64', or 'Win64'
SYSTEM_TYPE = 64            # '32' or '64', depending on your system
CUDA_ARCH = sm_62           # Type of CUDA architecture,
                            # for example 'compute_10', 'compute_11', 'sm_10'
NVCC_OPTIONS = --use_fast_math


# include paths
INCLUDEPATH += $$CUDA_DIR/include

# library directories
QMAKE_LIBDIR += $$CUDA_DIR/lib64/

# Add the necessary libraries
CUDA_LIBS = -lcuda -lcudart

# The following makes sure all path names (which often include spaces)
# are put between quotation marks
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
LIBS += $$CUDA_LIBS

# Configuration of the Cuda compiler
CONFIG(debug, debug|release) {
    # Debug mode
    cuda.input  = CUDA_SOURCES
    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
    cuda.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS \
                    $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE \
                    -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    cuda.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda
}
else {
    # Release mode
    cuda.input = CUDA_SOURCES
    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS \
                    $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE \
                    -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    cuda.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda
} 

结果分析

1个线程块,128个线程
在这里插入图片描述
2个线程块,128个线程,每个线程块64个线程
在这里插入图片描述
4个线程块,128个线程,每个线程块32个线程
在这里插入图片描述

参考:《CUDA并行程序设计,gpu编程指南》

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值