CLion CUDA Development Environment Setup (from scratch, very important)
March 10, 2024
1. Download the latest version of CLion
2. Download Visual Studio 2022
3. Configure Toolchains:
We'll play with this part later:
4. Configure CMake
For CUDA code, write the CMake Options like this (note that different projects may use different nvcc versions):
-DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/bin/nvcc.exe"
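Since the nvcc chosen here determines which CUDA runtime the binary is built against, one way to confirm what a compiled program actually uses is to query the runtime and driver versions. A minimal sketch (not from the original notes; these are standard CUDA runtime API calls):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int runtimeVersion = 0, driverVersion = 0;
    cudaRuntimeGetVersion(&runtimeVersion); // runtime the binary was built against, e.g. 12010 for 12.1
    cudaDriverGetVersion(&driverVersion);   // highest CUDA version the installed driver supports
    printf("runtime: %d, driver: %d\n", runtimeVersion, driverVersion);
    return 0;
}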
5. A first CUDA hello world (single file!)
1. CMakeLists.txt
cmake_minimum_required(VERSION 3.27)
project(untitled CUDA)
set(CMAKE_CUDA_STANDARD 11)
add_executable(untitled main.cu)
set_target_properties(untitled PROPERTIES
CUDA_SEPARABLE_COMPILATION ON)
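CUDA_SEPARABLE_COMPILATION ON enables relocatable device code, which lets device functions defined in one .cu file be called from another. A single-file hello world does not strictly need it, but it becomes relevant for the multi-file project in section 7.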
2. CMake options
3. main.cu
#include <iostream>
#include <cstdlib> // for system()

int main() {
    std::cout << "Hello, World!" << std::endl;
    system("pause"); // Windows-only: keeps the console window open
    return 0;
}
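Note that this main.cu contains no device code at all; it only proves that nvcc can compile and link a host program. To also check that kernels launch correctly, a minimal device-side hello world can be used instead. A sketch, not part of the original notes:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void helloKernel() {
    printf("Hello from thread %d\n", threadIdx.x); // printf is supported in device code
}

int main() {
    helloKernel<<<1, 4>>>();  // one block of four threads
    cudaDeviceSynchronize();  // wait for the kernel to finish and flush its output
    return 0;
}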
4. Build:
6. A CUDA matrix addition (single-.cu project)
1. CMakeLists.txt
cmake_minimum_required(VERSION 3.27)
project(untitled CUDA)
set(CMAKE_CUDA_STANDARD 11)
set(INC_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/include/")
set(LINK_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/lib/x64/")
include_directories(${INC_DIR})
link_directories(${LINK_DIR})
# Note: linking every CUDA library like this is overkill; this example only actually uses cudart.
link_libraries(cufft nppial nppist nvml cufftw nppicc nppisu nvrtc curand nppitc OpenCL cuda cusolver nppidei npps cudadevrt cusparse nppif nvblas cudart nppc nppig cudart_static nppim)
add_executable(untitled main.cu)
set_target_properties(untitled PROPERTIES
CUDA_SEPARABLE_COMPILATION ON)
2. CMake options (very important!)
-DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/bin/nvcc.exe"
3. main.cu
#include <iostream>
#include <typeinfo> // for typeid in main()
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <npp.h>  // unused below, but matches the NPP libraries linked in CMakeLists.txt
#include <time.h> // unused below

__global__ void addKernel(int *A, int *B, int *C, int N) {
    int i = blockIdx.y * blockDim.y + threadIdx.y; // global row index
    int j = blockIdx.x * blockDim.x + threadIdx.x; // global column index
    printf("i is %d, j is %d\n", i, j);
    if (i < N && j < N) { // guard for threads that fall outside the matrix
        C[i * N + j] = A[i * N + j] + B[i * N + j];
    }
}

void matrixAdd(int *A, int *B, int *C, int N) {
    int TILE_WIDTH = 2;
    // Assumes N is divisible by TILE_WIDTH; for general N, round up: (N + TILE_WIDTH - 1) / TILE_WIDTH.
    dim3 dimGrid(N / TILE_WIDTH, N / TILE_WIDTH); // how many blocks per grid
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);        // how many threads per block
    int *d_A, *d_B, *d_C;
    cudaMalloc((void **) &d_A, N * N * sizeof(int));
    cudaMalloc((void **) &d_B, N * N * sizeof(int));
    cudaMalloc((void **) &d_C, N * N * sizeof(int));
    cudaMemcpy(d_A, A, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N * N * sizeof(int), cudaMemcpyHostToDevice);
    addKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
    cudaMemcpy(C, d_C, N * N * sizeof(int), cudaMemcpyDeviceToHost); // this copy implicitly waits for the kernel
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
int main() {
    std::cout << "Hello, World!" << std::endl;
    const int N = 4;
    int h_A[N][N], h_B[N][N], h_C[N][N];
    // initialize h_A and h_B
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            h_A[i][j] = 1 * i * j;
            h_B[i][j] = 2 * i * j;
        }
    }
    std::cout << typeid(h_A).name() << std::endl;
    matrixAdd((int *) h_A, (int *) h_B, (int *) h_C, N);
    // not strictly required: the device-to-host cudaMemcpy inside matrixAdd already synchronized
    cudaDeviceSynchronize();
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            std::cout << h_C[i][j] << " ";
        }
        std::cout << std::endl;
    }
    // system("pause");
    return 0;
}
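With N = 4 and TILE_WIDTH = 2, the launch uses dimGrid = (2, 2) and dimBlock = (2, 2): four blocks of four threads, 16 threads in total, one per matrix element. Each thread's global row index blockIdx.y * blockDim.y + threadIdx.y runs over 0..3, and likewise for the column.

The code above deliberately ignores every CUDA return code to stay short. Real code should check them; a common pattern is a small macro like the following (CUDA_CHECK is an illustrative name, not part of the original notes):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                          \
    do {                                                          \
        cudaError_t err = (call);                                 \
        if (err != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",         \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// Usage: CUDA_CHECK(cudaMalloc((void **) &d_A, N * N * sizeof(int)));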
4. Build and results
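For reference, the result is checkable by hand: h_A[i][j] = i * j and h_B[i][j] = 2 * i * j, so h_C[i][j] should come out as 3 * i * j, i.e. the rows 0 0 0 0 / 0 3 6 9 / 0 6 12 18 / 0 9 18 27, preceded by the kernel's 16 per-thread printf lines in no guaranteed order.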
7. Another CUDA matrix addition (multiple .cu and .cpp files)
1. CMakeLists.txt
cmake_minimum_required(VERSION 3.27)
project(untitled CUDA)
set(CMAKE_CUDA_STANDARD 11)
set(INC_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/include/")
set(LINK_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/lib/x64/")
include_directories(${INC_DIR})
link_directories(${LINK_DIR})
link_libraries(cufft nppial nppist nvml cufftw nppicc nppisu nvrtc curand nppitc OpenCL cuda cusolver nppidei npps cudadevrt cusparse nppif nvblas cudart nppc nppig cudart_static nppim)
add_executable(untitled main.cu
        foo.cu # Only .cpp and .cu sources (or .lib files) need to be compiled; header files are never compiled!
        # foo.cuh # Listing the header here makes no difference either way; it is pulled in via #include.
)
set_target_properties(untitled PROPERTIES
CUDA_SEPARABLE_COMPILATION ON)
2. CMake build options
3. main.cu
#include <iostream>
#include <typeinfo> // for typeid below
// nvcc implicitly includes cuda_runtime.h when compiling .cu files, which is why cudaDeviceSynchronize() compiles without an explicit include here
#include "foo.cuh" // while coding, a declaration in a header is all you need: the implementation can live in a .cpp, a .cu, or a .lib; always #include the declarations of the functions you call

int main() {
    std::cout << "Hello, World!" << std::endl;
    const int N = 4;
    int h_A[N][N], h_B[N][N], h_C[N][N];
    // initialize h_A and h_B
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            h_A[i][j] = 1 * i * j;
            h_B[i][j] = 2 * i * j;
        }
    }
    std::cout << typeid(h_A).name() << std::endl;
    matrixAdd((int *) h_A, (int *) h_B, (int *) h_C, N);
    // not strictly required: the device-to-host cudaMemcpy inside matrixAdd already synchronized
    cudaDeviceSynchronize();
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            std::cout << h_C[i][j] << " ";
        }
        std::cout << std::endl;
    }
    // system("pause");
    return 0;
}
4. foo.cuh
#ifndef FOO_CUH
#define FOO_CUH

#ifdef __CUDACC__ // __global__ is only understood by nvcc; hide the kernel declaration from plain C++ compilers
__global__ void addKernel(int *A, int *B, int *C, int N);
#endif

void matrixAdd(int *A, int *B, int *C, int N); // ordinary host function, safe to declare everywhere

#endif // FOO_CUH
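The section title mentions .cpp files, but the project above contains only .cu sources. Because matrixAdd is an ordinary host function, a plain C++ translation unit can call it too, as long as the header hides the __global__ declaration from the host compiler (the guard above). A minimal sketch; caller.cpp is hypothetical and would also need to be listed in add_executable:

// caller.cpp -- host-only source file, compiled by the C++ compiler rather than nvcc
#include "foo.cuh" // only the matrixAdd declaration is visible here; addKernel is hidden behind __CUDACC__

void addFromCpp(int *A, int *B, int *C, int N) {
    matrixAdd(A, B, C, N); // definition lives in nvcc-compiled foo.cu and is resolved at link time
}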
5. foo.cu
#include "foo.cuh"
#include "cuda_runtime.h"
#include "stdio.h" //函数在实现的时候,要用到很多依赖,在这里include进来!
__global__ void addKernel(int *A, int *B, int *C, int N) {
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
printf("i is %d, j is %d\n", i, j);
if (i < N && j < N) {
C[i * N + j] = A[i * N + j] + B[i * N + j];
}
}
void matrixAdd(int *A, int *B, int *C, int N) {
const int blockSize = 4;
int TILE_WIDTH = 2; //
dim3 dimGrid(N / TILE_WIDTH, N / TILE_WIDTH); //定义一个Grid有多少个Block;
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH); //定义一个Blcok有多少个线程;
int *d_A, *d_B, *d_C;
cudaMalloc((void **) &d_A, N * N * sizeof(int));
cudaMalloc((void **) &d_B, N * N * sizeof(int));
cudaMalloc((void **) &d_C, N * N * sizeof(int));
cudaMemcpy(d_A, A, N * N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, N * N * sizeof(int), cudaMemcpyHostToDevice);
addKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
cudaMemcpy(C, d_C, N * N * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
6. Build:
Run results