Preface
This post uses a matrix addition example to show how to organize threads with grids and blocks.
Building matrix indices from blocks and threads
Typically, a matrix is stored linearly in global memory using a row-major layout. As shown in the figure below, this is an 8×6 matrix.
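As a minimal plain-C sketch of row-major storage (a standalone example that is not part of the CUDA program later in this post; the array name A and the 6-row by 8-column shape are chosen only to match the figure):

#include <stdio.h>

int main(void)
{
    // 6 rows x 8 columns stored row by row in one contiguous array:
    // element (row iy, column ix) lives at linear index iy * 8 + ix.
    float A[6 * 8];
    for (int iy = 0; iy < 6; iy++)
        for (int ix = 0; ix < 8; ix++)
            A[iy * 8 + ix] = (float)(iy * 8 + ix);
    printf("A(2, 5) = %.1f\n", A[2 * 8 + 5]);   // prints 21.0
    return 0;
}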
In a matrix addition kernel, one thread is usually assigned one data element to process. The first step is to use the block and thread indices to access the required data in global memory. Three kinds of indices have to be managed:
- thread and block indices;
- the coordinate of a given point in the matrix;
- the offset into linear global memory.
For a given thread, you can obtain its global memory offset by first mapping the thread and block indices to a matrix coordinate, and then mapping that matrix coordinate to a location in linear global memory.
Step 1: map the thread and block indices to a matrix coordinate with:
ix = threadIdx.x + blockDim.x*blockIdx.x;
iy = threadIdx.y + blockDim.y*blockIdx.y;
Step 2: map the matrix coordinate to a linear global memory index (storage location) with:
idx = iy * nx + ix;
The figure below illustrates the correspondence between block/thread indices, matrix coordinates, and linear global memory indices.
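As a quick sanity check (the 4×2 block size here is only for illustration; the program below uses 32×32 blocks): take the 8×6 matrix above, so nx = 8 and ny = 6, and let blockDim = (4, 2). The thread with blockIdx = (1, 2) and threadIdx = (2, 1) then computes
ix = 2 + 4*1 = 6
iy = 1 + 2*2 = 5
idx = 5*8 + 6 = 46
so it handles the element in row 5, column 6 (zero-based), which sits at offset 46 in linear global memory.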
Summing matrices with a 2D grid and 2D blocks
A matrix addition kernel written with a 2D grid and 2D blocks looks as follows:
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// Initialize an array with random float values
void initialInt(float *ip, const int size)
{
for (int i = 0; i < size; i++)
{
ip[i] = (float)(rand() & 0xff) / 66.6;
}
}
// Print A, B, and C element by element (intended for small matrices)
void printMatrix(float *A, float *B, float *C, const int nx, const int ny)
{
float *ia = A, *ib = B, *ic = C;
printf("\nMatrix:(%d, %d)\n", nx, ny);
for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
{
printf("%f + %f = %f ", ia[ix], ib[ix], ic[ix]);
}
ia += nx;
ib += nx;
ic += nx;
printf("\n");
}
printf("\n");
}
// Check the result: print the element-wise difference between the host result C and the device result CC
void printResult(float *C, float *CC, const int nx, const int ny)
{
float *ic = C, *icc = CC;
for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
{
printf("%f ", ic[ix]-icc[ix]);
}
ic += nx;
icc += nx;
printf("\n");
}
printf("\n");
}
// Host: compute C = A + B
void sumMatrixOnHost(float *A, float*B, float*C, const int nx, const int ny)
{
float *ia = A, *ib = B, *ic = C;
for (int iy = 0; iy < ny; iy++)
{
for (int ix = 0; ix < nx; ix++)
{
ic[ix] = ia[ix] + ib[ix];
}
ia += nx;
ib += nx;
ic += nx;
}
}
// Kernel: compute C = A + B, one thread per matrix element
__global__ void sumMatrixOnDevice(float *MatA, float *MatB, float *MatC, const int nx, const int ny)
{
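// Map (blockIdx, threadIdx) to a matrix coordinate (ix, iy) and then to a
// linear global memory offset idx, exactly as in the two formulas above.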
int ix = threadIdx.x + blockDim.x*blockIdx.x;
int iy = threadIdx.y + blockDim.y*blockIdx.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny)
{
MatC[idx] = MatA[idx] + MatB[idx];
}
}
int main(int argc, char **argv)
{
//printf("%s Starting...\n", argv[0]);
// get device information
int dev = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("Using Device %d: %s\n\n", dev, deviceProp.name);
// set matrix dimension
int nx = 1<<14;
int ny = 1<<14;
int nxy = nx * ny;
int nBytes = nxy * sizeof(float);
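// 1 << 14 = 16384, so each matrix holds 2^28 floats (1 GiB); the four host copies and three device copies below need several GiB of memory in total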
// malloc host memory
float *h_A, *h_B, *h_C, *h_CC;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
h_C = (float *)malloc(nBytes);
h_CC = (float *)malloc(nBytes);
// initialize the host matrices with random data
initialInt(h_A, nxy);
initialInt(h_B, nxy);
// start CPU timing
clock_t cpuStart = clock();
sumMatrixOnHost(h_A, h_B, h_C, nx, ny);
// stop CPU timing
clock_t cpuEnd = clock();
float cpuTime = (float)(cpuEnd - cpuStart) / CLOCKS_PER_SEC;
printf("cpu time:%f\n", cpuTime);
//printMatrix(h_A, h_B, h_C, nx, ny);
// malloc device memory
float *d_MatA, *d_MatB, *d_MatC;
cudaMalloc((void **)&d_MatA, nBytes);
cudaMalloc((void **)&d_MatB, nBytes);
cudaMalloc((void **)&d_MatC, nBytes);
// start GPU timing
clock_t gpuStart = clock();
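// note: this timed interval covers the host-to-device copies, the kernel, and the device-to-host copy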
// transfer data from host to device
cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);
// set up execution configuration
int dimx = 32;
int dimy = 32;
dim3 block(dimx, dimy);
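// round the grid size up so every element is covered even when nx or ny is not a multiple of the block size; the kernel's bounds check discards the extra threads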
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
//dim3 grid(2, 2);
// invoke the kernel
sumMatrixOnDevice<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
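// kernel launches are asynchronous, so block here until the kernel has finished before copying the result back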
cudaDeviceSynchronize();
// transfer data from device to host
cudaMemcpy(h_CC, d_MatC, nBytes, cudaMemcpyDeviceToHost);
// stop GPU timing
clock_t gpuEnd = clock();
float gpuTime = (float)(gpuEnd - gpuStart) / CLOCKS_PER_SEC;
printf("gpu time:%f\n", gpuTime);
//printMatrix(h_A, h_B, h_CC, nx, ny);
//printResult(h_C, h_CC, nx, ny);
// free host and device memory
cudaFree(d_MatA);
cudaFree(d_MatB);
cudaFree(d_MatC);
free(h_A);
free(h_B);
free(h_C);
free(h_CC);
// reset device
cudaDeviceReset();
return 0;
}
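Assuming the listing is saved as sumMatrix2D.cu (the file name is only an example), it can be compiled and run with nvcc, for instance on Linux:

nvcc sumMatrix2D.cu -o sumMatrix2D
./sumMatrix2D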
The output is as follows: