0. 思路
为了把思路讲清楚,本文采用 step by step 的方式,一步一步迭代,这样每一步的改动都比较容易理解。源代码改自 NVIDIA 官方示例 vectorAdd。
step 1, 单 cu 文件的可执行文件版本
源代码
main_app.cu
#include <stdio.h>
#include <cuda_runtime.h>
/**
 * Element-wise sum of squares: C[i] = A[i] * A[i] + B[i] * B[i].
 *
 * Each thread derives one flat 1-D index from blockDim.x, blockIdx.x and
 * threadIdx.x and processes at most that one element. Threads whose index
 * is not below n return without touching memory, so the grid may contain
 * more threads than there are elements.
 *
 * @param A  first input array, at least n elements
 * @param B  second input array, at least n elements
 * @param C  output array, at least n elements
 * @param n  number of elements to process
 */
template <typename T>
__global__ void vector_square_add(T *A, T *B, T *C, int n)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n)
    {
        return;  // surplus thread in the last block
    }

    const T a = A[idx];
    const T b = B[idx];
    C[idx] = a * a + b * b;
}

// Explicit instantiation for float.
template __global__ void vector_square_add<float>(float *A, float *B, float *C, int n);
/**
 * Element-wise addition: C[i] = A[i] + B[i].
 *
 * Each thread derives one flat 1-D index from blockDim.x, blockIdx.x and
 * threadIdx.x and processes at most that one element; threads whose index
 * is not below n do nothing.
 *
 * The sum is computed purely in T. The earlier "+ 0.0f" term has been
 * removed: it forced a round-trip through float for every T, which loses
 * precision for integer sums above 2^24. For float the only observable
 * difference is that (-0.0f) + (-0.0f) now stays -0.0f.
 *
 * @param A  first input array, at least n elements
 * @param B  second input array, at least n elements
 * @param C  output array, at least n elements
 * @param n  number of elements to process
 */
template <typename T>
__global__ void vector_add_kernel(T *A, T *B, T *C, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        C[i] = A[i] + B[i];
    }
}

// Explicit instantiation for float.
template __global__ void vector_add_kernel(float *A, float *B, float *C, int n);
/**
 * Host-side launcher for vector_add_kernel<T>: computes C = A + B.
 *
 * Uses 1-D blocks of 256 threads and enough blocks to cover n elements,
 * prints the chosen launch configuration, and launches on the default
 * stream. The function does not synchronize; the kernel may still be
 * running when it returns.
 *
 * A non-positive n is treated as "nothing to do": a zero-sized grid is an
 * invalid launch configuration, and a negative n would wrap around in the
 * unsigned grid-size arithmetic.
 *
 * A rejected launch is reported on stderr. cudaPeekAtLastError() is used so
 * the error is NOT cleared and callers can still query it themselves.
 *
 * @param A  first input array (passed straight to the kernel)
 * @param B  second input array (passed straight to the kernel)
 * @param C  output array (passed straight to the kernel)
 * @param n  number of elements
 */
template <typename T>
void ic_vector_add(T *A, T *B, T *C, int n)
{
    if (n <= 0)
    {
        return;
    }

    const unsigned int threads_per_block = 256;
    const dim3 block(threads_per_block);
    // Ceil-division so a partially filled last block covers the tail.
    const dim3 grid((static_cast<unsigned int>(n) + threads_per_block - 1) / threads_per_block);

    printf("CUDA kernel launch with %u blocks of %u threads\n", grid.x, block.x);
    vector_add_kernel<T><<<grid, block>>>(A, B, C, n);

    const cudaError_t launch_status = cudaPeekAtLastError();
    if (launch_status != cudaSuccess)
    {
        fprintf(stderr, "vector_add_kernel launch failed: %s\n",
                cudaGetErrorString(launch_status));
    }
}

// Explicit instantiation for float.
template void ic_vector_add(float* A, float *B, float* C, int n);
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_succeeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Demo driver: adds two 50-element vectors (all 3s and all 4s) on the GPU
 * through ic_vector_add, copies the result back, prints it and verifies it.
 *
 * Every allocation and CUDA call is checked. "Test PASSED" is printed only
 * after each element has been compared with the host-side sum. All buffers
 * are released on every path.
 *
 * @return 0 on success, 1 if any step or the verification failed.
 */
int main(void)
{
    const int n = 50;
    const size_t size = n * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;

    bool ok = (h_A != NULL) && (h_B != NULL) && (h_C != NULL);
    if (!ok)
    {
        fprintf(stderr, "Failed to allocate host vectors\n");
    }

    if (ok)
    {
        for (int i = 0; i < n; ++i)
        {
            h_A[i] = 3; // rand() / (float)RAND_MAX;
            h_B[i] = 4; // rand() / (float)RAND_MAX;
        }
    }

    // Short-circuit evaluation stops at the first failing step.
    ok = ok && cuda_succeeded(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)");
    ok = ok && cuda_succeeded(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)");
    ok = ok && cuda_succeeded(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)");
    ok = ok && cuda_succeeded(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
    ok = ok && cuda_succeeded(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");

    if (ok)
    {
        // The host wrapper picks the launch configuration and launches the kernel.
        ic_vector_add(d_A, d_B, d_C, n);
        ok = cuda_succeeded(cudaGetLastError(), "vector add kernel launch");
    }

    if (ok)
    {
        printf("Copy output data from the CUDA device to the host memory\n");
        // A blocking copy on the default stream: it completes after the kernel.
        ok = cuda_succeeded(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");
    }

    if (ok)
    {
        for (int i = 0; i < n; ++i)
        {
            printf("%3.2f ", h_C[i]);

            float diff = h_A[i] + h_B[i] - h_C[i];
            if (diff < 0.0f)
            {
                diff = -diff;
            }
            // Written as !(diff <= tol) so that a NaN result also fails.
            if (!(diff <= 1e-5f))
            {
                fprintf(stderr, "\nResult verification failed at element %d!\n", i);
                ok = false;
                break;
            }
        }
    }

    if (ok)
    {
        printf("\nTest PASSED\n");
    }

    // cudaFree(NULL) and free(NULL) are no-ops, so this is safe on every path.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return ok ? 0 : 1;
}
Makefile
# Step-1 build: a single .cu file compiled into the executable `main_`.
#
# The source is copied to main_.cu first so that every generated file is
# named either `main_` or `main_.<suffix>`, which lets `clean` remove them
# with a narrow pattern.
# NOTE(review): assumes `nvcc --keep` names its intermediates main_.<suffix>.
#
# SRC defaults to the file name used in the article; override it if your
# source is named differently, e.g. `make SRC=main.cu`.
SRC ?= main_app.cu

.PHONY: all clean

all: main_

main_.cu: $(SRC)
	cp $< $@

# --keep leaves the intermediate files (PTX etc.) next to the object file.
main_.o: main_.cu
	nvcc $< -c --keep

main_: main_.o
	nvcc $< -o $@

# Deliberately not `main_*`: that pattern would also delete main_app.cu.
clean:
	-rm -f main_ main_.*
step 2, 一个 API 函数的动态链接库 Makefile 版本
文件结构和内容稍微做了一些调整:在 libicmm.so 的 CUDA 和 C++ 代码中,暂时没有使用模板。
文件目录如下:
包括三个 Makefile(makefile_bin、Makefile、testing/Makefile)在内,一共涉及 8 个文件,下面按从上到下的顺序依次列出,假设顶层目录为 icmm_top/
内含
icmm_top/bin
icmm_top/gpu/add.cu
#include <stdio.h>
#include <cuda_runtime.h>
/**
 * Element-wise float addition: C[i] = A[i] + B[i] + 0.0f.
 *
 * Each thread derives one flat 1-D index from blockDim.x, blockIdx.x and
 * threadIdx.x and handles at most that one element. Threads whose index is
 * not below n return immediately, so the grid may be larger than n.
 *
 * @param A  first input array, at least n floats
 * @param B  second input array, at least n floats
 * @param C  output array, at least n floats
 * @param n  number of elements to process
 */
__global__ void vector_add_kernel(float *A, float *B, float *C, int n)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n)
    {
        return;  // surplus thread in the last block
    }

    const float sum = A[idx] + B[idx];
    C[idx] = sum + 0.0f;
}
/**
 * C-linkage entry point that launches vector_add_kernel: C = A + B.
 *
 * Uses 1-D blocks of 256 threads and enough blocks to cover n elements,
 * prints the chosen launch configuration, and launches on the default
 * stream. The function does not synchronize; the kernel may still be
 * running when it returns.
 *
 * A non-positive n is treated as "nothing to do": a zero-sized grid is an
 * invalid launch configuration, and a negative n would wrap around in the
 * unsigned grid-size arithmetic.
 *
 * A rejected launch is reported on stderr. cudaPeekAtLastError() is used so
 * the error is NOT cleared and callers can still query it themselves.
 *
 * @param A  first input array (passed straight to the kernel)
 * @param B  second input array (passed straight to the kernel)
 * @param C  output array (passed straight to the kernel)
 * @param n  number of elements
 */
extern "C" void vector_add_gpu(float *A, float *B, float *C, int n)
{
    if (n <= 0)
    {
        return;
    }

    const unsigned int threads_per_block = 256;
    const dim3 block(threads_per_block);
    // Ceil-division so a partially filled last block covers the tail.
    const dim3 grid((static_cast<unsigned int>(n) + threads_per_block - 1) / threads_per_block);

    printf("CUDA kernel launch with %u blocks of %u threads\n", grid.x, block.x);
    vector_add_kernel<<<grid, block>>>(A, B, C, n);

    const cudaError_t launch_status = cudaPeekAtLastError();
    if (launch_status != cudaSuccess)
    {
        fprintf(stderr, "vector_add_kernel launch failed: %s\n",
                cudaGetErrorString(launch_status));
    }
}
icmm_top/gpu/add.h
#pragma once
/*
 * Launches the GPU vector addition C = A + B over n floats.
 *
 * The three pointers are handed straight to a CUDA kernel, so they must be
 * addressable from device code. The call only launches the kernel; it does
 * not synchronize with the device.
 *
 * The function has C linkage. The linkage block is guarded so that this
 * header can be included from both C and C++ translation units.
 */
#ifdef __cplusplus
extern "C" {
#endif

void vector_add_gpu(float *A, float *B, float *C, int n);

#ifdef __cplusplus
}
#endif
icmm_top/include/ic_add.h
#pragma once
#include<cuda_runtime.h>
// Prints "hello world!" followed by a newline to standard output.
void hello_print();
// Public entry point of the library: forwards A, B, C and n unchanged to
// vector_add_gpu(), which launches the GPU vector addition C = A + B.
// The test program passes buffers obtained from cudaMalloc.
void ic_add(float* A, float* B, float *C, int n);
icmm_top/makefile_bin
# executable
# Builds the stand-alone test binary directly from the sources (no shared
# library) and copies it into bin/.
TARGET = test

# Toolkit location and GPU architecture; override on the command line,
# e.g. `make -f makefile_bin CUDA_HOME=/usr/local/cuda ARCH=sm_80`.
CUDA_HOME ?= /usr/local/cuda-11.4
ARCH ?= sm_70

CUDA_INC = -I$(CUDA_HOME)/include
CUDA_LIBS = -L$(CUDA_HOME)/lib64 -lcudart -lcudadevrt

.PHONY: all clean

all: $(TARGET)

# -dc = compile to an object containing relocatable device code
# (equivalent to -rdc=true -c).
add.o: gpu/add.cu
	nvcc -dc -arch=$(ARCH) $< -o $@

# Device-link step: resolves the relocatable device code into an object
# the host linker can consume.
add_link.o: add.o
	nvcc -arch=$(ARCH) -dlink -o $@ $< $(CUDA_LIBS)

# Compile-only steps take include paths only; -L/-l belong to the link step.
ic_add.o: src/ic_add.cpp gpu/add.h
	g++ -c $< -o $@ $(CUDA_INC) -I./

test.o: testing/test.cpp include/ic_add.h
	g++ -c $< -o $@ $(CUDA_INC) -I./include

$(TARGET): add.o ic_add.o test.o add_link.o
	g++ $^ $(CUDA_LIBS) -o $@
	mkdir -p bin
	cp ./$@ ./bin/

clean:
	-rm -f *.o bin/* $(TARGET)
icmm_top/Makefile
#libicmm.so
# Builds the shared library lib/libicmm.so from position-independent objects.
TARGETS = libicmm.so
LIB_DIR = lib

# Toolkit location and GPU architecture; override on the command line,
# e.g. `make CUDA_HOME=/usr/local/cuda ARCH=sm_80`.
CUDA_HOME ?= /usr/local/cuda-11.4
ARCH ?= sm_70

CUDA_INC = -I$(CUDA_HOME)/include
CUDA_LIBS = -L$(CUDA_HOME)/lib64 -lcudart -lcudadevrt

# $(TARGETS) is phony because the real output lives in $(LIB_DIR)/ and the
# objects are deleted after linking, so the library is rebuilt on every run.
.PHONY: all clean $(TARGETS)

all: $(TARGETS)

# Host code must be PIC for a shared object. -dc / -rdc=true are left
# disabled here, as in the original rule.
add.o: gpu/add.cu
	nvcc -Xcompiler -fPIC -arch=$(ARCH) -c $< -o $@

add_link.o: add.o
	nvcc -Xcompiler -fPIC -arch=$(ARCH) -dlink -o $@ $< $(CUDA_LIBS)

# Compile-only step takes include paths only; -L/-l belong to the link step.
ic_add.o: src/ic_add.cpp gpu/add.h
	g++ -fPIC -c $< -o $@ $(CUDA_INC) -I./

$(TARGETS): add.o ic_add.o add_link.o
	mkdir -p $(LIB_DIR)
	g++ -shared -fPIC $^ -o $(LIB_DIR)/$@ $(CUDA_LIBS)
	# Remove the PIC objects so they cannot be mistaken for the non-PIC
	# objects of the same name that makefile_bin produces in this directory.
	-rm -f *.o

clean:
	-rm -f *.o $(LIB_DIR)/*.so test ./bin/test
icmm_top/src/ic_add.cpp
#include <stdio.h>
#include <cuda_runtime.h>
#include "gpu/add.h"
//extern void vector_add_gpu(float *A, float *B, float *C, int n);
// Writes the fixed greeting line to standard output.
void hello_print()
{
    fputs("hello world!\n", stdout);
}
// Library entry point for vector addition.
// Hands all four arguments, unchanged, to the GPU implementation
// vector_add_gpu() declared in gpu/add.h.
void ic_add(float* A, float* B, float *C, int n)
{
    vector_add_gpu(A, B, C, n);
}
icmm_top/testing/Makefile
#test
# Builds the test program against the shared library ../lib/libicmm.so.
TARGET = test

# Toolkit location; override with e.g. `make CUDA_HOME=/usr/local/cuda`.
CUDA_HOME ?= /usr/local/cuda-11.4
ICMM_LIB_DIR = ../lib

# Compile-time and link-time flags are kept apart: -L/-l have no effect
# when compiling with -c.
INCLUDES = -I$(CUDA_HOME)/include -I../include
# The rpath lets ./test find libicmm.so without setting LD_LIBRARY_PATH.
LIB_DIRS = -L$(CUDA_HOME)/lib64 -L$(ICMM_LIB_DIR) -Wl,-rpath,$(abspath $(ICMM_LIB_DIR))
LIBS = -licmm -lcudart -lcudadevrt

.PHONY: all clean

all: $(TARGET)

test.o: test.cpp ../include/ic_add.h
	g++ -c $< -o $@ $(INCLUDES)

$(TARGET): test.o
	g++ $< -o $@ $(LIB_DIRS) $(LIBS)

clean:
	-rm -f *.o $(TARGET)
icmm_top/testing/test.cpp
#include "ic_add.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_succeeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Test driver for the library: adds two 50-element vectors (all 3s and all
 * 4s) on the GPU through ic_add(), copies the result back, prints it and
 * verifies it.
 *
 * Every allocation and CUDA call is checked. "Test PASSED" is printed only
 * after each element has been compared with the host-side sum. All buffers
 * are released on every path.
 *
 * @return 0 on success, 1 if any step or the verification failed.
 */
int main(void)
{
    const int n = 50;
    const size_t size = n * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;

    bool ok = (h_A != NULL) && (h_B != NULL) && (h_C != NULL);
    if (!ok)
    {
        fprintf(stderr, "Failed to allocate host vectors\n");
    }

    if (ok)
    {
        for (int i = 0; i < n; ++i)
        {
            h_A[i] = 3; // rand() / (float)RAND_MAX;
            h_B[i] = 4; // rand() / (float)RAND_MAX;
        }
    }

    // Short-circuit evaluation stops at the first failing step.
    ok = ok && cuda_succeeded(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)");
    ok = ok && cuda_succeeded(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)");
    ok = ok && cuda_succeeded(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)");
    ok = ok && cuda_succeeded(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
    ok = ok && cuda_succeeded(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");

    if (ok)
    {
        // The library call picks the launch configuration and launches the kernel.
        ic_add(d_A, d_B, d_C, n);
        ok = cuda_succeeded(cudaGetLastError(), "ic_add kernel launch");
    }

    if (ok)
    {
        printf("Copy output data from the CUDA device to the host memory\n");
        // A blocking copy on the default stream: it completes after the kernel.
        ok = cuda_succeeded(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");
    }

    if (ok)
    {
        for (int i = 0; i < n; ++i)
        {
            printf("%3.2f ", h_C[i]);

            float diff = h_A[i] + h_B[i] - h_C[i];
            if (diff < 0.0f)
            {
                diff = -diff;
            }
            // Written as !(diff <= tol) so that a NaN result also fails.
            if (!(diff <= 1e-5f))
            {
                fprintf(stderr, "\nResult verification failed at element %d!\n", i);
                ok = false;
                break;
            }
        }
    }

    if (ok)
    {
        printf("\nTest PASSED\n");
    }

    // cudaFree(NULL) and free(NULL) are no-ops, so this is safe on every path.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return ok ? 0 : 1;
}
唯一需要注意的是:nvcc 一共被调用了两次,其中第二次使用了 -dlink 选项(设备代码链接):
# First nvcc call: compile gpu/add.cu into add.o. Per the nvcc documentation,
# -dc produces an object containing relocatable device code
# (it is equivalent to -rdc=true -c).
add.o: gpu/add.cu
nvcc -dc -rdc=true -arch=sm_70 -c gpu/add.cu
# Second nvcc call: -dlink device-links add.o and writes add_link.o, which is
# then passed to the host linker (g++) together with add.o.
add_link.o: add.o
nvcc -arch=sm_70 -dlink -o add_link.o add.o -L/usr/local/cuda-11.4/lib64 -lcudart -lcudadevrt
运行效果图:
下一篇内容: