实验内容
- 写一个hello_cuda.cu
#include <stdio.h>
// Kernel: every thread that runs it prints one greeting line through
// device-side printf.
__global__ void hello_from_gpu(void)
{
    printf("Hello World from the GPU!\n");
}
// Host entry point: launches hello_from_gpu with one block of one thread and
// waits for the device to finish before exiting.
// Returns 0 on success, 1 if the launch or the kernel execution reports an error.
int main(void)
{
    hello_from_gpu<<<1, 1>>>();

    // A kernel launch has no return value; a bad configuration or an
    // unsupported target architecture is only reported by cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Wait for the kernel to complete; errors raised while it was running
    // are returned here.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
编译用/usr/local/cuda/bin/nvcc -arch=compute_53 -code=sm_53 hello_cuda.cu -o hello_cuda -run
(末尾的-run表示编译完成后立即运行一次),得到一个叫hello_cuda的可执行文件,之后用./hello_cuda
即可再次执行这个文件。
另外,可以采用make的方式完成编译,Makefile这么写:
# Builds ./hello_cuda from a single CUDA source file with nvcc.
# NOTE: every recipe line must start with a TAB character, otherwise make
# stops with "missing separator".
TEST_SOURCE = hello_cuda.cu
TARGETBIN := ./hello_cuda
CC = /usr/local/cuda/bin/nvcc

# Rebuild the executable whenever the source file changes.
$(TARGETBIN): $(TEST_SOURCE)
	$(CC) $(TEST_SOURCE) -o $(TARGETBIN)

# clean is not a file; the leading '-' lets make continue if rm fails.
.PHONY: clean
clean:
	-rm -rf $(TARGETBIN)
	-rm -rf *.o
- makefile编译多文件的Cuda程序
要编译hello_cuda02-test.cu,它引用了hello_from_gpu.cuh和hello_from_gpu.cu,
makefile这么写:
# Builds ./hello_cuda_multi_file from hello_cuda02-test.cu and a separately
# compiled hello_from_gpu.o.
# NOTE: every recipe line must start with a TAB character, otherwise make
# stops with "missing separator".
TEST_SOURCE = hello_cuda02-test.cu
TARGETBIN := ./hello_cuda_multi_file
CC = /usr/local/cuda/bin/nvcc

# The header is listed as a prerequisite so that editing it triggers a rebuild.
$(TARGETBIN): $(TEST_SOURCE) hello_from_gpu.o hello_from_gpu.cuh
	$(CC) $(TEST_SOURCE) hello_from_gpu.o -o $(TARGETBIN)

# --device-c compiles to an object file containing relocatable device code.
# Assumes hello_from_gpu.cu includes hello_from_gpu.cuh -- confirm.
hello_from_gpu.o: hello_from_gpu.cu hello_from_gpu.cuh
	$(CC) --device-c hello_from_gpu.cu -o hello_from_gpu.o

# clean is not a file; the leading '-' lets make continue if rm fails.
.PHONY: clean
clean:
	-rm -rf $(TARGETBIN)
	-rm -rf *.o
查看程序性能:
sudo /usr/local/cuda/bin/nvprof ./hello_cuda
回显中的“Profiling result:”部分是在GPU上(kernel函数)运行的时间,“API calls:”部分是在CPU上测量的程序调用API的时间。
- CUDA 线程组织
我们如何能够得到一个线程在所有的线程中的索引值?比如:我们申请了4个线程块,每个线程块有8个线程,那么我们就申请了32个线程,那么我需要找到第3个线程块(编号为2的block)里面的第6个线程(编号为5的thread)在所有线程中的索引值怎么办?
这时,我们就需要blockDim 和 gridDim这两个变量:
- gridDim表示一个grid中包含多少个block
- blockDim表示一个block中包含多少个线程
也就是说,在上面的那个例子中,gridDim.x=4, blockDim.x=8
那么,我们要找的第22个线程(编号为21)的唯一索引就应该是:index = blockIdx.x * blockDim.x + threadIdx.x = 2 * 8 + 5 = 21
#include<stdio.h>
// Kernel: each thread prints the x index of its block and its own x index
// inside that block.
__global__ void printid()
{
    const int blk = blockIdx.x;
    const int tid = threadIdx.x;
    printf("Hello World from block %d and thread %d!\n", blk, tid);
}
// Host entry point: launches printid on a 1D grid of 5 blocks with 65 threads
// each and waits for the device to finish before exiting.
// Returns 0 on success, 1 if the launch or the kernel execution reports an error.
int main()
{
    printid<<<5, 65>>>();

    // A kernel launch has no return value; an invalid launch configuration
    // is only reported by cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Wait for all blocks to complete; errors raised while the kernel was
    // running are returned here.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
编译运行,回显是:
- sobel.cu边缘检测kernel优化
#include <opencv2/opencv.hpp>
#include <iostream>
using namespace std;
using namespace cv;
// Sobel edge detection on the GPU.
//
// `in` and `out` are device buffers of imgHeight * imgWidth bytes, one byte
// per pixel, addressed as y * imgWidth + x. The kernel expects a 2D launch
// (x -> column, y -> row) with at least one thread per pixel; threads that
// fall outside the image do nothing.
//
// 3x3 neighbourhood naming used below:
//   x0 x1 x2
//   x3 x4 x5
//   x6 x7 x8
//
// Interior pixels receive (|Gx| + |Gy|) / 2 clamped to 255. Border pixels,
// which have no complete neighbourhood, receive 0 so every byte of `out`
// is defined after the kernel has run.
__global__ void sobel_gpu(unsigned char* in, unsigned char* out, int imgHeight, int imgWidth)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;

    // The grid is rounded up to whole blocks, so some threads lie past the image.
    if (x >= imgWidth || y >= imgHeight)
        return;

    int index = y * imgWidth + x;

    if (x == 0 || y == 0 || x == imgWidth - 1 || y == imgHeight - 1)
    {
        out[index] = 0;
        return;
    }

    // Neighbours are read as int so the sums below cannot overflow.
    // x4 (the centre pixel) has weight 0 in both masks and is not needed.
    int x0 = in[(y - 1) * imgWidth + x - 1];
    int x1 = in[(y - 1) * imgWidth + x    ];
    int x2 = in[(y - 1) * imgWidth + x + 1];
    int x3 = in[(y    ) * imgWidth + x - 1];
    int x5 = in[(y    ) * imgWidth + x + 1];
    int x6 = in[(y + 1) * imgWidth + x - 1];
    int x7 = in[(y + 1) * imgWidth + x    ];
    int x8 = in[(y + 1) * imgWidth + x + 1];

    int Gx = (x0 + x3 * 2 + x6) - (x2 + x5 * 2 + x8);
    int Gy = (x0 + x1 * 2 + x2) - (x6 + x7 * 2 + x8);

    // The averaged magnitude can reach 765; clamp it instead of letting the
    // conversion to unsigned char wrap around.
    int magnitude = (abs(Gx) + abs(Gy)) / 2;
    out[index] = (unsigned char)(magnitude > 255 ? 255 : magnitude);
}
// Sobel edge detection on the CPU.
//
// Reads 8-bit pixels from srcImg and writes (|Gx| + |Gy|) / 2, saturated to
// the 0..255 range, into dstImg for every interior pixel
// (rows 1..imgHeight-2, columns 1..imgWidth-2). Border pixels of dstImg are
// left untouched, so the caller decides their value.
void sobel_cpu(Mat srcImg, Mat dstImg, int imgHeight, int imgWidth)
{
    for (int i = 1; i < imgHeight - 1; i++)
    {
        // Row pointers for the row above, the current row and the row below.
        const uchar* dataUp = srcImg.ptr<uchar>(i - 1);
        const uchar* data = srcImg.ptr<uchar>(i);
        const uchar* dataDown = srcImg.ptr<uchar>(i + 1);
        uchar* out = dstImg.ptr<uchar>(i);
        for (int j = 1; j < imgWidth - 1; j++)
        {
            int Gx = (dataUp[j - 1] + 2 * data[j - 1] + dataDown[j - 1]) - (dataUp[j + 1] + 2 * data[j + 1] + dataDown[j + 1]);
            int Gy = (dataUp[j - 1] + 2 * dataUp[j] + dataUp[j + 1]) - (dataDown[j - 1] + 2 * dataDown[j] + dataDown[j + 1]);
            // The averaged magnitude can reach 765; saturate instead of
            // letting the conversion to uchar wrap around.
            out[j] = saturate_cast<uchar>((abs(Gx) + abs(Gy)) / 2);
        }
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Reads 1.jpg as grayscale, denoises it, runs Sobel edge detection on both
// the CPU and the GPU over the same denoised image, and writes the GPU
// result to save.png.
// Returns 0 on success, 1 if the image cannot be read, a CUDA call fails or
// the result cannot be written.
int main()
{
    // Read the image through OpenCV; flag 0 loads it as single-channel grayscale.
    Mat img = imread("1.jpg", 0);
    if (img.empty())
    {
        cerr << "failed to read 1.jpg" << endl;
        return 1;
    }
    int imgWidth = img.cols;
    int imgHeight = img.rows;

    // Denoise the grayscale image with a 3x3 Gaussian blur.
    Mat gaussImg;
    GaussianBlur(img, gaussImg, Size(3, 3), 0, 0, BORDER_DEFAULT);
    // The pixel data is copied to the device as one flat buffer below,
    // so it must not contain row padding.
    if (!gaussImg.isContinuous())
        gaussImg = gaussImg.clone();

    // dst_cpu holds the CPU result, dst_gpu the GPU result.
    Mat dst_cpu(imgHeight, imgWidth, CV_8UC1, Scalar(0));
    Mat dst_gpu(imgHeight, imgWidth, CV_8UC1, Scalar(0));

    // CPU reference.
    sobel_cpu(gaussImg, dst_cpu, imgHeight, imgWidth);

    // Allocate the device buffers.
    size_t num = static_cast<size_t>(imgHeight) * imgWidth * sizeof(unsigned char);
    unsigned char* in_gpu = nullptr;
    unsigned char* out_gpu = nullptr;
    if (!cudaOk(cudaMalloc((void**)&in_gpu, num), "cudaMalloc(in_gpu)"))
        return 1;
    if (!cudaOk(cudaMalloc((void**)&out_gpu, num), "cudaMalloc(out_gpu)"))
        return 1;
    // Zero the output so every byte copied back is defined, including any
    // pixel the kernel does not write.
    if (!cudaOk(cudaMemset(out_gpu, 0, num), "cudaMemset(out_gpu)"))
        return 1;

    // Grid/block shape: 32x32 threads per block, grid rounded up to cover the image.
    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Upload the denoised image, i.e. the same input the CPU reference used.
    if (!cudaOk(cudaMemcpy(in_gpu, gaussImg.data, num, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)"))
        return 1;

    // Run the kernel; launch errors are only visible through cudaGetLastError().
    sobel_gpu<<<blocksPerGrid, threadsPerBlock>>>(in_gpu, out_gpu, imgHeight, imgWidth);
    if (!cudaOk(cudaGetLastError(), "sobel_gpu launch"))
        return 1;

    // Copy the result back to host memory; errors raised while the kernel
    // was running are reported by this call.
    if (!cudaOk(cudaMemcpy(dst_gpu.data, out_gpu, num, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)"))
        return 1;

    bool saved = imwrite("save.png", dst_gpu);
    if (!saved)
        cerr << "failed to write save.png" << endl;

    // Displaying the result is disabled because the Jupyter environment used
    // here cannot show images.
    //imshow("gpu", dst_gpu);
    //imshow("cpu", dst_cpu);
    //waitKey(0);

    // Release the device buffers.
    cudaFree(in_gpu);
    cudaFree(out_gpu);
    return saved ? 0 : 1;
}
编译命令:
/usr/local/cuda/bin/nvcc sobel.cu -L /usr/lib/aarch64-linux-gnu/libopencv*.so -I /usr/include/opencv4 -o sobel
这里的so文件也可以先将LD_PRELOAD变量赋值为需要的.so文件,用预加载的方式使用so库文件。(LD_PRELOAD可以影响程序的运行时的链接,它允许你定义在程序运行前优先加载的动态链接库,这个功能主要就是用来有选择性的载入不同动态链接库中的相同函数。)
运行后的回显: