参加cuda线上训练营学习笔记-2

xiuyuebaby

已于 2023-02-09 19:57:58 修改

阅读量155

点赞数

文章标签：学习 python 开发语言

于 2023-02-07 22:38:13 首次发布

本文链接：https://blog.csdn.net/qq_47465913/article/details/128926593

版权

存储单元

GPU中有6种内存，Thread对其有着不同的权限，所处的位置也不同，如下表：

	Global Memory	Constant Memory	Texture Memory	Local Memory	Shared Memory	Registers
RW	R/W	R	R	R/W	R/W	R/W
Locate	Grid	Grid	Grid	Thread	Block	Thread
Host RW	R/W	R/W	R/W	/	/	/

内存的使用类似于cpu中的malloc()等函数：

内存的申请：

cudaMalloc()

内存初始化（将一段内存按字节设置为指定的值）：

cudaMemset()

内存释放：

cudaFree()

cpu和gpu内存间复制：

cudaMemcpy(void* dst,const void* src,size_t count,cudaMemcpyKind kind)

实例：矩阵相乘

现有M，N两个矩阵，P矩阵=M*N：

如使用cpu进行串行计算，以两个3×3的矩阵为例，需进行3*3*3=27次乘加运算，代码如下：

/*
 * Serial CPU reference for the matrix product h_result = h_m * h_n.
 *
 * The indexing treats all three buffers as flat row-major arrays:
 *   h_m      : m x n input  (element (i, h) at h_m[i * n + h])
 *   h_n      : n x k input  (element (h, j) at h_n[h * k + j])
 *   h_result : m x k output (element (i, j) at h_result[i * k + j])
 *
 * Performs m * k * n multiply-adds. Accumulation is done in int, so the
 * caller must keep the inputs small enough that the dot products fit.
 */
void cpu_matrix_mult(int *h_m,int *h_n,int *h_result,int m,int n,int k){
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < k; ++j) {
            // Dot product of row i of h_m with column j of h_n.
            int tmp = 0;
            for (int h = 0; h < n; ++h) {
                tmp += h_m[i * n + h] * h_n[h * k + j];
            }
            h_result[i * k + j] = tmp;
        }
    }
}

使用gpu则可以大大节省运行时间：

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 16

/*
 * Integer matrix-multiply kernel: c = a * b.
 *
 * Buffers are indexed as flat row-major arrays: a is m x n, b is n x k and
 * c is m x k. Each thread produces at most one element of c.
 *
 * Launch layout: a 2D grid of 2D blocks in which the x dimension spans the
 * k output columns and the y dimension spans the m output rows. The grid may
 * overshoot the matrix; surplus threads exit without touching memory.
 */
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int m, int n, int k)
{
    const int out_col = threadIdx.x + blockDim.x * blockIdx.x;
    const int out_row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that fall outside the m x k output have nothing to compute.
    if (out_row >= m || out_col >= k) {
        return;
    }

    // Dot product of row `out_row` of a with column `out_col` of b.
    const int *a_row = a + out_row * n;
    int acc = 0;
    for (int t = 0; t < n; ++t) {
        acc += a_row[t] * b[t * k + out_col];
    }

    c[out_row * k + out_col] = acc;
}


// Abort with a diagnostic if a CUDA runtime call does not return cudaSuccess.
// Without this, an early failure (e.g. a failed allocation) would go unnoticed
// and only show up later as a wrong result.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Multiplies two random 100 x 100 integer matrices on the GPU, recomputes the
 * product on the CPU with cpu_matrix_mult(), and prints "Pass!!!" when the two
 * results are identical or "Error!!!" otherwise.
 *
 * Returns 0 when the results match and 1 when they differ; any failing CUDA
 * call terminates the process with EXIT_FAILURE.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    int m=100;
    int n=100;
    int k=100;

    const size_t bytes_a = sizeof(int) * m * n;
    const size_t bytes_b = sizeof(int) * n * k;
    const size_t bytes_c = sizeof(int) * m * k;

    // Host buffers: inputs a and b, GPU result c, CPU reference result cc.
    int *h_a, *h_b, *h_c, *h_cc;
    CUDA_CHECK(cudaMallocHost((void **) &h_a, bytes_a));
    CUDA_CHECK(cudaMallocHost((void **) &h_b, bytes_b));
    CUDA_CHECK(cudaMallocHost((void **) &h_c, bytes_c));
    CUDA_CHECK(cudaMallocHost((void **) &h_cc, bytes_c));

    // Fill the inputs with values in [0, 1023]; with n = 100 every dot
    // product stays far below INT_MAX, so int accumulation cannot overflow.
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            h_a[i * n + j] = rand() % 1024;
        }
    }

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            h_b[i * k + j] = rand() % 1024;
        }
    }

    // Device buffers that receive copies of the host inputs and hold the output.
    int *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc((void **) &d_a, bytes_a));
    CUDA_CHECK(cudaMalloc((void **) &d_b, bytes_b));
    CUDA_CHECK(cudaMalloc((void **) &d_c, bytes_c));

    // Copy matrices A and B from host to device memory.
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes_a, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes_b, cudaMemcpyHostToDevice));

    // Ceil-divide so the grid covers the whole m x k output even when the
    // dimensions are not multiples of BLOCK_SIZE (x -> columns, y -> rows).
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    // A kernel launch returns no status; query it explicitly so an invalid
    // launch configuration is reported instead of silently producing garbage.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy on the default stream: it is ordered after the kernel and
    // does not return until the data is on the host, so no separate
    // synchronization call is needed. Kernel execution faults surface here.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes_c, cudaMemcpyDeviceToHost));

    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);

    // Both results are integers computed with the same operations, so they
    // must match exactly; no floating-point tolerance is involved.
    int ok = 1;
    for (int i = 0; i < m && ok; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            if (h_cc[i*k + j] != h_c[i*k + j])
            {
                ok = 0;
                break;
            }
        }
    }

    if(ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }

    // Free device and pinned host memory.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaFreeHost(h_a));
    CUDA_CHECK(cudaFreeHost(h_b));
    CUDA_CHECK(cudaFreeHost(h_c));
    CUDA_CHECK(cudaFreeHost(h_cc));
    return ok ? 0 : 1;
}