CLion CUDA Development Environment Setup (From Scratch, Very Important)


March 10, 2024

1. Download the latest version of CLion

2. Download Visual Studio 2022

3. Configure Toolchains:

[screenshot: image-20240310191449020]

We'll come back to this one later:

[screenshot: image-20240310191603375]

4. Configure CMake

[screenshot: image-20240310191654262]

For CUDA code, fill in the CMake Options field like this (note that different projects may use different nvcc versions):

-DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/bin/nvcc.exe"
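
If you want to verify at run time which toolkit the build actually picked up, the following minimal sketch (my own addition; it only uses the standard CUDA runtime API) can be compiled as a .cu file in the same project:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int runtimeVersion = 0, driverVersion = 0;
  cudaRuntimeGetVersion(&runtimeVersion);  // version of the CUDA runtime this program was built against
  cudaDriverGetVersion(&driverVersion);    // highest CUDA version the installed driver supports
  // Encoded as 1000*major + 10*minor, e.g. 12010 means CUDA 12.1.
  std::printf("runtime: %d, driver: %d\n", runtimeVersion, driverVersion);
  return 0;
}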

5. A CUDA Hello World (single file!)

1. CMakeLists.txt


cmake_minimum_required(VERSION 3.27)
project(untitled CUDA)

set(CMAKE_CUDA_STANDARD 11)

add_executable(untitled main.cu)

set_target_properties(untitled PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON)

2. CMake settings

[screenshot: image-20240310214318645]

3. main.cu

#include <cstdlib>   // for system("pause")
#include <iostream>

int main() {
  std::cout << "Hello, World!" << std::endl;
  system("pause");   // Windows-only: keep the console window open
  return 0;
}

4. Build:

[screenshot: image-20240310192249771]
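
Hello World only exercises the host compiler. Once it builds, a slightly stronger check (again my own sketch, assuming at least one CUDA-capable GPU is installed) is to query the device through the runtime API:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int deviceCount = 0;
  cudaError_t err = cudaGetDeviceCount(&deviceCount);
  if (err != cudaSuccess || deviceCount == 0) {
    std::printf("No CUDA device found: %s\n", cudaGetErrorString(err));
    return 1;
  }

  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, 0);  // properties of device 0
  std::printf("Device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
  return 0;
}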

6. A CUDA Matrix Addition (single-.cu project)

1. CMakeLists.txt


cmake_minimum_required(VERSION 3.27)
project(untitled CUDA)

set(CMAKE_CUDA_STANDARD 11)

# Point at the CUDA 12.1 toolkit headers and libraries (adjust the path to your install).
set(INC_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/include/")
set(LINK_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/lib/x64/")
include_directories(${INC_DIR})
link_directories(${LINK_DIR})
# Link the toolkit libraries; this example only actually needs the runtime (cudart), the rest are optional.
link_libraries(cufft nppial nppist nvml cufftw nppicc nppisu nvrtc curand nppitc OpenCL cuda cusolver nppidei npps cudadevrt cusparse nppif nvblas cudart nppc nppig cudart_static nppim)

add_executable(untitled main.cu)

set_target_properties(untitled PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON)

2. CMake settings (important)

[screenshot: image-20240310214219827]

-DCMAKE_CUDA_COMPILER="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/bin/nvcc.exe"

3. main.cu

#include <iostream>
#include <typeinfo>   // for typeid
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <npp.h>      // not used below
#include <time.h>     // not used below

__global__ void addKernel(int *A, int *B, int *C, int N) {
  int i = blockIdx.y * blockDim.y + threadIdx.y;
  int j = blockIdx.x * blockDim.x + threadIdx.x;
  printf("i is %d, j is %d\n", i, j);

  if (i < N && j < N) {
    C[i * N + j] = A[i * N + j] + B[i * N + j];
  }
}

void matrixAdd(int *A, int *B, int *C, int N) {
  int TILE_WIDTH = 2;                            // edge length of one block, in threads
  dim3 dimGrid(N / TILE_WIDTH, N / TILE_WIDTH);  // how many blocks the grid contains
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);         // how many threads each block contains

  int *d_A, *d_B, *d_C;
  cudaMalloc((void **) &d_A, N * N * sizeof(int));
  cudaMalloc((void **) &d_B, N * N * sizeof(int));
  cudaMalloc((void **) &d_C, N * N * sizeof(int));

  cudaMemcpy(d_A, A, N * N * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, N * N * sizeof(int), cudaMemcpyHostToDevice);
  addKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);

  cudaMemcpy(C, d_C, N * N * sizeof(int), cudaMemcpyDeviceToHost);

  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
}

int main() {
  std::cout << "Hello, World!" << std::endl;

  const int N = 4;
  int h_A[N][N], h_B[N][N], h_C[N][N];

  // initialize h_A and h_B
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      h_A[i][j] = 1 * i * j;
      h_B[i][j] = 2 * i * j;
    }
  }

  std::cout << typeid(h_A).name() << std::endl;

  matrixAdd((int *) h_A, (int *) h_B, (int *) h_C, N);

  // wait for the GPU to finish before reading the result
  cudaDeviceSynchronize();

  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      std::cout << h_C[i][j] << " ";
    }
    std::cout << std::endl;
  }


//  system("pause");
  return 0;
}
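
Note that the cudaMalloc/cudaMemcpy calls and the kernel launch above all ignore their return codes. A minimal error-checking sketch (my own addition; the CHECK macro name is arbitrary) that could be added to this file looks like:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Print the failing call's location and abort if a runtime call returns an error.
#define CHECK(call)                                                   \
  do {                                                                \
    cudaError_t err_ = (call);                                        \
    if (err_ != cudaSuccess) {                                        \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                   cudaGetErrorString(err_), __FILE__, __LINE__);     \
      std::exit(EXIT_FAILURE);                                        \
    }                                                                 \
  } while (0)

// Usage inside matrixAdd:
//   CHECK(cudaMalloc((void **) &d_A, N * N * sizeof(int)));
//   addKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
//   CHECK(cudaGetLastError());  // catches bad launch configurations
//   CHECK(cudaMemcpy(C, d_C, N * N * sizeof(int), cudaMemcpyDeviceToHost));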

4. Build and results

[screenshot: image-20240310214410450]
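
With h_A[i][j] = i*j and h_B[i][j] = 2*i*j, every element of the result should be 3*i*j. If you want the program to check that itself, a small host-side helper (a sketch; checkResult is a name I made up, not from the original code) could be added to main.cu:

#include <iostream>

// Returns true if every element of C equals the expected value 3*i*j.
bool checkResult(const int *C, int N) {
  bool ok = true;
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      if (C[i * N + j] != 3 * i * j) {
        std::cout << "mismatch at (" << i << "," << j << "): got "
                  << C[i * N + j] << ", expected " << 3 * i * j << std::endl;
        ok = false;
      }
    }
  }
  return ok;
}

// In main, after matrixAdd(...) and cudaDeviceSynchronize():
//   std::cout << (checkResult((int *) h_C, N) ? "result OK" : "result WRONG") << std::endl;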

7. CUDA Matrix Addition Again (multiple .cu and .cpp files)

1. CMakeLists.txt


cmake_minimum_required(VERSION 3.27)
project(untitled CUDA)

set(CMAKE_CUDA_STANDARD 11)

set(INC_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/include/")
set(LINK_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/lib/x64/")
include_directories(${INC_DIR})
link_directories(${LINK_DIR})
link_libraries(cufft nppial nppist nvml cufftw nppicc nppisu nvrtc curand nppitc OpenCL cuda cusolver nppidei npps cudadevrt cusparse nppif nvblas cudart nppc nppig cudart_static nppim)

add_executable(untitled main.cu
        foo.cu # Only .cpp and .cu sources (or .lib files) need to be compiled; header files are not compiled.
#        foo.cuh # Listing the header here or not makes no difference.
)

set_target_properties(untitled PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON)

2. CMake build settings

[screenshot: image-20240310215215898]

3. main.cu

#include <iostream>
#include <typeinfo>  // for typeid
#include "foo.cuh"   // Including the header gives you the declarations; the definitions can live in a .cpp, a .cu, or a library. Always include the declarations of the functions you call.

int main() {
  std::cout << "Hello, World!" << std::endl;

  const int N = 4;
  int h_A[N][N], h_B[N][N], h_C[N][N];

  // initialize h_A and h_B
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      h_A[i][j] = 1 * i * j;
      h_B[i][j] = 2 * i * j;
    }
  }

  std::cout << typeid(h_A).name() << std::endl;

  matrixAdd((int *) h_A, (int *) h_B, (int *) h_C, N);

  // wait for the GPU to finish before reading the result
  cudaDeviceSynchronize();

  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      std::cout << h_C[i][j] << " ";
    }
    std::cout << std::endl;
  }

//  system("pause");
  return 0;
}

4. foo.cuh

#ifndef FOO_CUH
#define FOO_CUH

__global__ void addKernel(int *A, int *B, int *C, int N);
void matrixAdd(int *A, int *B, int *C, int N);

#endif // FOO_CUH

5. foo.cu

#include "foo.cuh"
#include "cuda_runtime.h"  
#include "stdio.h"  //函数在实现的时候,要用到很多依赖,在这里include进来!

__global__ void addKernel(int *A, int *B, int *C, int N) {
  int i = blockIdx.y * blockDim.y + threadIdx.y;
  int j = blockIdx.x * blockDim.x + threadIdx.x;
  printf("i is %d, j is %d\n", i, j);

  if (i < N && j < N) {
    C[i * N + j] = A[i * N + j] + B[i * N + j];
  }
}

void matrixAdd(int *A, int *B, int *C, int N) {
  int TILE_WIDTH = 2;                            // edge length of one block, in threads
  dim3 dimGrid(N / TILE_WIDTH, N / TILE_WIDTH);  // how many blocks the grid contains
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);         // how many threads each block contains

  int *d_A, *d_B, *d_C;
  cudaMalloc((void **) &d_A, N * N * sizeof(int));
  cudaMalloc((void **) &d_B, N * N * sizeof(int));
  cudaMalloc((void **) &d_C, N * N * sizeof(int));

  cudaMemcpy(d_A, A, N * N * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, N * N * sizeof(int), cudaMemcpyHostToDevice);
  addKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);

  cudaMemcpy(C, d_C, N * N * sizeof(int), cudaMemcpyDeviceToHost);

  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
}
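
One caveat about matrixAdd as written: dimGrid is N / TILE_WIDTH, so the grid only covers the whole matrix when N is a multiple of TILE_WIDTH (here N = 4 and TILE_WIDTH = 2, so it works out). A common generalization (my own sketch, not part of the original post) rounds the grid size up and relies on the i < N && j < N check in the kernel to discard the extra threads; the launch-configuration lines inside matrixAdd would become:

  int TILE_WIDTH = 2;
  // Ceiling division: enough blocks to cover N even when N % TILE_WIDTH != 0.
  int gridSide = (N + TILE_WIDTH - 1) / TILE_WIDTH;
  dim3 dimGrid(gridSide, gridSide);
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
  addKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);  // out-of-range threads return early in the kernel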

6. Build:

[screenshot: image-20240310215501968]

Run result:

[screenshot: image-20240310215522373]
