1D Grid, 1D Block Vector Addition
Basic Implementation
#include <stdio.h>
#include <math.h>
#define N 100
const double EPS = 1E-6;
__global__ void add(const double *x, const double *y, double *z, int n) {
    // Global index of this thread
    const int index = blockDim.x * blockIdx.x + threadIdx.x;
    // Stride: total number of threads in the grid (grid-stride loop)
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        z[i] = x[i] + y[i];
    }
}
// Verify the result: every element should be 50 + 20 = 70
void check(const double *z, const int n) {
    bool error = false;
    double maxError = 0;
    for (int i = 0; i < n; i++) {
        maxError = fmax(maxError, fabs(z[i] - 70));
        if (fabs(z[i] - 70) > EPS) {
            error = true;
        }
    }
    printf("%s\n", error ? "Errors" : "Pass");
    printf("Max error: %lf\n", maxError);
}
int main() {
    const int arraySize = sizeof(double) * N;
    // Allocate pinned (page-locked) host memory
    double *h_x, *h_y, *h_z;
    cudaMallocHost(&h_x, arraySize);
    cudaMallocHost(&h_y, arraySize);
    cudaMallocHost(&h_z, arraySize);
    // Initialize the data
    for (int i = 0; i < N; i++) {
        h_x[i] = 50;
        h_y[i] = 20;
    }
    // Allocate device memory
    double *d_x, *d_y, *d_z;
    cudaMalloc((void **)&d_x, arraySize);
    cudaMalloc((void **)&d_y, arraySize);
    cudaMalloc((void **)&d_z, arraySize);
    // Copy input data from host to device
    cudaMemcpy(d_x, h_x, arraySize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, arraySize, cudaMemcpyHostToDevice);
    // Kernel execution configuration
    dim3 blockSize(128);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);
    // Launch the kernel
    add<<<gridSize, blockSize>>>(d_x, d_y, d_z, N);
    // Copy the result from device back to host (this cudaMemcpy also synchronizes)
    cudaMemcpy(h_z, d_z, arraySize, cudaMemcpyDeviceToHost);
    // Check the result
    check(h_z, N);
    // Free pinned host memory
    cudaFreeHost(h_x);
    cudaFreeHost(h_y);
    cudaFreeHost(h_z);
    // Free device memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    return 0;
}
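None of the CUDA calls above check their return values. A minimal sketch of a checking macro built on the standard cudaGetErrorString API (the macro name CHECK_CUDA is our own, not part of CUDA):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Wrap a CUDA runtime call and abort with file/line info on failure.
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",              \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Example: CHECK_CUDA(cudaMalloc((void **)&d_x, arraySize));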
Unified Memory Implementation
#include <stdio.h>
#include <math.h>
#define N 100
const double EPS = 1E-6;
__global__ void add(const double *x, const double *y, double *z, int n) {
    // Global index of this thread
    const int index = blockDim.x * blockIdx.x + threadIdx.x;
    // Stride: total number of threads in the grid (grid-stride loop)
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        z[i] = x[i] + y[i];
    }
}
// Verify the result: every element should be 50 + 20 = 70
void check(const double *z, const int n) {
    bool error = false;
    double maxError = 0;
    for (int i = 0; i < n; i++) {
        maxError = fmax(maxError, fabs(z[i] - 70));
        if (fabs(z[i] - 70) > EPS) {
            error = true;
        }
    }
    printf("%s\n", error ? "Errors" : "Pass");
    printf("Max error: %lf\n", maxError);
}
int main() {
    // Allocate unified (managed) memory, accessible from both host and device
    const int arraySize = sizeof(double) * N;
    double *x, *y, *z;
    cudaMallocManaged((void **)&x, arraySize);
    cudaMallocManaged((void **)&y, arraySize);
    cudaMallocManaged((void **)&z, arraySize);
    // Initialize the data
    for (int i = 0; i < N; i++) {
        x[i] = 50;
        y[i] = 20;
    }
    // Kernel execution configuration
    dim3 blockSize(128);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);
    // Launch the kernel
    add<<<gridSize, blockSize>>>(x, y, z, N);
    // Kernel launches are asynchronous; wait before reading z on the host
    cudaDeviceSynchronize();
    // Check the result
    check(z, N);
    // Free unified memory
    cudaFree(x);
    cudaFree(y);
    cudaFree(z);
    return 0;
}
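With managed memory, the first GPU touch of each page triggers on-demand migration. As an optional refinement, on devices that support it, the buffers can be prefetched before use. A sketch using the standard cudaMemPrefetchAsync API (x, y, z, arraySize, gridSize, and blockSize refer to the program above; this replaces the launch-and-synchronize section of main):

int deviceId;
cudaGetDevice(&deviceId);
// Move inputs and output to the GPU ahead of the kernel launch
cudaMemPrefetchAsync(x, arraySize, deviceId, 0);
cudaMemPrefetchAsync(y, arraySize, deviceId, 0);
cudaMemPrefetchAsync(z, arraySize, deviceId, 0);
add<<<gridSize, blockSize>>>(x, y, z, N);
// Bring the result back to the host before check() reads it
cudaMemPrefetchAsync(z, arraySize, cudaCpuDeviceId, 0);
cudaDeviceSynchronize();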
2D Grid, 2D Block Matrix Multiplication
Basic Implementation
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define BLOCK_SIZE 16
const double EPS = 1E-6;
// GPU matrix multiplication: a is m x n, b is n x k, c is m x k
__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k) {
    // Global coordinates of this thread, i.e. the row and column in the result matrix
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int sum = 0;
    if (row < m && col < k) {
        for (int i = 0; i < n; i++) {
            sum += a[row * n + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
}
// CPU matrix multiplication for reference
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            int sum = 0;
            for (int h = 0; h < n; h++) {
                sum += h_a[i * n + h] * h_b[h * k + j];
            }
            h_result[i * k + j] = sum;
        }
    }
}
// Compare the GPU result against the CPU result
void check(int *h_c_cpu, int *h_c_gpu, int m, int k) {
    bool error = false;
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            if (fabs(h_c_gpu[i * k + j] - h_c_cpu[i * k + j]) > EPS) {
                error = true;
            }
        }
    }
    printf("Check result: %s\n", error ? "Errors" : "Pass");
}
int main() {
    int m = 5;
    int n = 5;
    int k = 5;
    // Allocate unified memory
    int *a, *b, *c_gpu, *c_cpu;
    cudaMallocManaged((void **)&a, sizeof(int) * m * n);
    cudaMallocManaged((void **)&b, sizeof(int) * n * k);
    cudaMallocManaged((void **)&c_gpu, sizeof(int) * m * k);
    cudaMallocManaged((void **)&c_cpu, sizeof(int) * m * k);
    // Initialize the data
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < k; j++) {
            b[i * k + j] = rand() % 1024;
        }
    }
    // Kernel execution configuration
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 grid(grid_cols, grid_rows);
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    // Launch the kernel
    gpu_matrix_mult<<<grid, block>>>(a, b, c_gpu, m, n, k);
    // Wait for the kernel to finish before reading c_gpu on the host
    cudaDeviceSynchronize();
    // Print the GPU result
    printf("GPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_gpu[i * k + j]);
        }
        printf("\n");
    }
    // CPU matrix multiplication
    cpu_matrix_mult(a, b, c_cpu, m, n, k);
    // Print the CPU result
    printf("CPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_cpu[i * k + j]);
        }
        printf("\n");
    }
    // Check the result
    check(c_cpu, c_gpu, m, k);
    // Free unified memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(c_cpu);
    cudaFree(c_gpu);
    return 0;
}
Shared Memory Optimized Implementation
For matrix multiplication, suppose matrix M is m×n, matrix N is n×k, and the resulting matrix P is m×k.
For example, a single value M(row, col) has to be read from global memory once by every thread in the grid satisfying row = blockIdx.y * blockDim.y + threadIdx.y, i.e. once for each of the k output columns, so k times in total.
Instead of repeating these reads, the value can be staged in shared memory, reducing the cost of every access after the first.
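As a back-of-the-envelope count of the saving (our own estimate, assuming B = BLOCK_SIZE divides m, n, and k evenly): the basic kernel issues 2n global loads per output element, while the tiled kernel loads each B×B tile of a and b exactly once per block:

$$\text{basic loads} = 2mnk, \qquad \text{tiled loads} = \underbrace{\frac{m}{B}\cdot\frac{k}{B}}_{\text{blocks}} \cdot \underbrace{\frac{n}{B}}_{\text{tiles}} \cdot 2B^2 = \frac{2mnk}{B}$$

so global-memory traffic drops by a factor of BLOCK_SIZE.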
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define BLOCK_SIZE 16
const double EPS = 1E-6;
// GPU matrix multiplication using shared memory tiles
__global__ void gpu_matrix_mult_shared(int *a, int *b, int *c, int m, int n, int k) {
    // Global coordinates of this thread, i.e. the row and column in the result matrix
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Shared memory tiles, one copy per block
    __shared__ int s_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int s_b[BLOCK_SIZE][BLOCK_SIZE];
    int sum = 0;
    // Walk the tiles along the shared dimension n;
    // sub * BLOCK_SIZE is the offset of the current tile
    for (int sub = 0; sub < (n + BLOCK_SIZE - 1) / BLOCK_SIZE; sub++) {
        /** index_a: index into matrix a of the element this thread loads
         *  - sub * BLOCK_SIZE + threadIdx.x: column of the element
         *  - row * n: offset of all rows of a before this one
         **/
        int index_a = (sub * BLOCK_SIZE + threadIdx.x) + row * n;
        if (row < m && (sub * BLOCK_SIZE + threadIdx.x) < n) {
            s_a[threadIdx.y][threadIdx.x] = a[index_a];
        } else {
            s_a[threadIdx.y][threadIdx.x] = 0;  // zero-pad outside the matrix
        }
        /** index_b: index into matrix b of the element this thread loads
         *  - col: column of the element
         *  - k * (sub * BLOCK_SIZE + threadIdx.y): offset of all rows of b
         *    before this one (b has n rows of k elements, so the row stride is k)
         **/
        int index_b = col + k * (sub * BLOCK_SIZE + threadIdx.y);
        if (col < k && (sub * BLOCK_SIZE + threadIdx.y) < n) {
            s_b[threadIdx.y][threadIdx.x] = b[index_b];
        } else {
            s_b[threadIdx.y][threadIdx.x] = 0;  // zero-pad outside the matrix
        }
        // Synchronize so the whole tile is loaded before any thread reads it
        __syncthreads();
        // Accumulate the partial dot product from the shared tiles
        for (int i = 0; i < BLOCK_SIZE; i++) {
            sum += s_a[threadIdx.y][i] * s_b[i][threadIdx.x];
        }
        // Synchronize again before the tiles are overwritten in the next iteration
        __syncthreads();
    }
    // Write the result once, after all tiles have been processed
    if (row < m && col < k) {
        c[row * k + col] = sum;
    }
}
// CPU matrix multiplication for reference
void cpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k) {
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            int sum = 0;
            for (int h = 0; h < n; h++) {
                sum += a[i * n + h] * b[h * k + j];
            }
            c[i * k + j] = sum;
        }
    }
}
// Compare the GPU result against the CPU result
void check(int *c_cpu, int *c_gpu, int m, int k) {
    bool error = false;
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            if (fabs(c_gpu[i * k + j] - c_cpu[i * k + j]) > EPS) {
                error = true;
            }
        }
    }
    printf("Check result: %s\n", error ? "Errors" : "Pass");
}
int main() {
    int m = 5;
    int n = 5;
    int k = 5;
    // Allocate unified memory
    int *a, *b, *c_gpu, *c_cpu;
    cudaMallocManaged((void **)&a, sizeof(int) * m * n);
    cudaMallocManaged((void **)&b, sizeof(int) * n * k);
    cudaMallocManaged((void **)&c_gpu, sizeof(int) * m * k);
    cudaMallocManaged((void **)&c_cpu, sizeof(int) * m * k);
    // Initialize the data
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < k; j++) {
            b[i * k + j] = rand() % 1024;
        }
    }
    // Kernel execution configuration
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 grid(grid_cols, grid_rows);
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    // Launch the kernel
    gpu_matrix_mult_shared<<<grid, block>>>(a, b, c_gpu, m, n, k);
    // Wait for the kernel to finish before reading c_gpu on the host
    cudaDeviceSynchronize();
    // Print the GPU result
    printf("GPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_gpu[i * k + j]);
        }
        printf("\n");
    }
    // CPU matrix multiplication
    cpu_matrix_mult(a, b, c_cpu, m, n, k);
    // Print the CPU result
    printf("CPU result:\n");
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < k; j++) {
            printf("%d ", c_cpu[i * k + j]);
        }
        printf("\n");
    }
    // Check the result
    check(c_cpu, c_gpu, m, k);
    // Free unified memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(c_cpu);
    cudaFree(c_gpu);
    return 0;
}
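To see the payoff of the shared-memory version, the two kernels can be timed with CUDA events. A sketch using the standard cudaEvent API (grid, block, and the pointers refer to the program above; m, n, and k would need to be much larger than 5 for the difference to show):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
gpu_matrix_mult_shared<<<grid, block>>>(a, b, c_gpu, m, n, k);
cudaEventRecord(stop);
// Block the host until the stop event has actually been reached
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("Shared-memory kernel: %f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);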