实验内容
CUDA实现矩阵乘法
核心思想就是用空间换时间
这部分内容比较简单,不赘述,回显对比了用cpu进行计算和用gpu计算的矩阵乘法结果是否一致:
检错和计时
Cuda编程模型中的事件。事件的本质就是一个标记,它与其所在的流内的特定点相关联。可以使用时间来执行以下两个基本任务:
- 同步流执行
- 监控设备的进展
流中的任意点都可以通过API插入事件以及查询事件完成的函数,只有事件所在流中其之前的操作都完成后才能触发事件完成。默认流中设置事件,那么其前面的所有操作都完成时,事件才出发完成。 事件就像一个个路标,其本身不执行什么功能,就像我们最原始测试c语言程序的时候插入的无数多个printf一样。
声明:
cudaEvent_t event;
创建:
cudaError_t cudaEventCreate(cudaEvent_t* event);
销毁:
cudaError_t cudaEventDestroy(cudaEvent_t event);
添加事件到当前执行流:
cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream = 0);
等待事件完成,设立flag:
cudaError_t cudaEventSynchronize(cudaEvent_t event);//阻塞
cudaError_t cudaEventQuery(cudaEvent_t event);//非阻塞
当然,我们也可以用它来记录执行的事件:
cudaError_t cudaEventElapsedTime(float* ms, cudaEvent_t start, cudaEvent_t stop);
下面来测试一下核函数执行的时间,另外也引入我们的错误检测头文件error.cuh,.cu代码这么写:
#include <stdio.h>
#include <math.h>
#include "error.cuh"
#define BLOCK_SIZE 16
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int m, int n, int k)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
if( col < k && row < m)
{
for(int i = 0; i < n; i++)
{
sum += a[row * n + i] * b[i * k + col];
}
c[row * k + col] = sum;
}
}
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < k; ++j)
{
int tmp = 0.0;
for (int h = 0; h < n; ++h)
{
tmp += h_a[i * n + h] * h_b[h * k + j];
}
h_result[i * k + j] = tmp;
}
}
}
int main(int argc, char const *argv[])
{
int m=100;
int n=100;
int k=100;
int *h_a, *h_b, *h_c, *h_cc;
CHECK(cudaMallocHost((void **) &h_a, sizeof(int)*m*n));
CHECK(cudaMallocHost((void **) &h_b, sizeof(int)*n*k));
CHECK(cudaMallocHost((void **) &h_c, sizeof(int)*m*k));
CHECK(cudaMallocHost((void **) &h_cc, sizeof(int)*m*k));
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
h_a[i * n + j] = rand() % 1024;
}
}
for (int i = 0; i < n; ++i) {
for (int j = 0; j < k; ++j) {
h_b[i * k + j] = rand() % 1024;
}
}
int *d_a, *d_b, *d_c;
CHECK(cudaMalloc((void **) &d_a, sizeof(int)*m*n));
CHECK(cudaMalloc((void **) &d_b, sizeof(int)*n*k));
CHECK(cudaMalloc((void **) &d_c, sizeof(int)*m*k));
// copy matrix A and B from host to device memory
CHECK(cudaMemcpy(d_a, h_a, sizeof(int)*m*n, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_b, h_b, sizeof(int)*n*k, cudaMemcpyHostToDevice));
unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 dimGrid(grid_cols, grid_rows);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
CHECK(cudaEventRecord(start));
//cudaEventQuery(start);
gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("Time = %g ms.\n", elapsed_time);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
CHECK(cudaMemcpy(h_c, d_c, (sizeof(int)*m*k), cudaMemcpyDeviceToHost));
//cudaThreadSynchronize();
cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);
int ok = 1;
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < k; ++j)
{
if(fabs(h_cc[i*k + j] - h_c[i*k + j])>(1.0e-10))
{
ok = 0;
}
}
}
if(ok)
{
printf("Pass!!!\n");
}
else
{
printf("Error!!!\n");
}
// free memory
CHECK(cudaFree(d_a));
CHECK(cudaFree(d_b));
CHECK(cudaFree(d_c));
CHECK(cudaFreeHost(h_a));
CHECK(cudaFreeHost(h_b));
CHECK(cudaFreeHost(h_c));
CHECK(cudaFreeHost(h_cc));
return 0;
}
其中引用的error.cuh参考CUDA-Programming/error.cuh at master · brucefan1983/CUDA-Programming · GitHub
PS: 这里我发现了给的答案的一处小错误,h_cc没有在最后进行释放。我们需要进行`CHECK(cudaFreeHost(h_cc));``
Makefile这么写:
TEST_SOURCE = matrix_mul.cu
TARGETBIN := ./matrix_mul
CC = /usr/local/cuda/bin/nvcc
$(TARGETBIN):$(TEST_SOURCE)
$(CC) $(TEST_SOURCE) -o $(TARGETBIN) -I ./
.PHONY:clean
clean:
-rm -rf $(TARGETBIN)
这里由于我们引用的头文件error.cuh,所以编译命令里需要写-I选项,表明include路径包含本文件夹,确保能找到error.cuh。
PS: 这里由于是我自己在jupyter上手敲的内容,按tab会默认加四个空格而非是制表符,所以一直会报下面这个错,复制别的地方的tab过来以后就好了。
那么编译运行后,如果程序出错,回显是这样的:
正常的回显会打出来调用GPU计算矩阵乘法时的耗时:
真的是十分方便!
用shared memory加速矩阵乘法和bank conflict
当我们在处理矩阵乘法时,假设矩阵
M
(
m
,
k
)
∗
N
(
k
,
n
)
=
P
(
m
,
n
)
M(m,k)*N(k,n) = P(m,n)
M(m,k)∗N(k,n)=P(m,n)。那么,矩阵M中的一个数值m(x,y),就要被grid中所有满足
t
h
r
e
a
d
I
d
x
.
y
+
b
l
o
c
k
I
d
x
.
y
∗
b
l
o
c
k
D
i
m
.
y
=
y
threadIdx.y+blockIdx.y*blockDim.y = y
threadIdx.y+blockIdx.y∗blockDim.y=y的线程从Global Memory中读一次,一共就是K次。那么,我们看到这么多重复读取,就可以把这个变量放在Shared Memory中,极大地减少每次的读取时间。
我们采用分tile块的方式,将一部分子矩阵加载到SMem中,如图中的蓝色块和橙色块。
代码这么写:
#include <stdio.h>
#include <math.h>
#include "error.cuh"
#define BLOCK_SIZE 16
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int m, int n, int k)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
if( col < k && row < m)
{
for(int i = 0; i < n; i++)
{
sum += a[row * n + i] * b[i * k + col];
}
c[row * k + col] = sum;
}
}
__global__ void gpu_matrix_mult_shared(int *d_a, int *d_b, int *d_result, int n)
{
__shared__ int tile_a[BLOCK_SIZE][BLOCK_SIZE];
__shared__ int tile_b[BLOCK_SIZE][BLOCK_SIZE];
int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int tmp = 0;
int idx;
for (int sub = 0; sub < gridDim.x; ++sub)
{
idx = row * n + sub * BLOCK_SIZE + threadIdx.x;
tile_a[threadIdx.y][threadIdx.x] = row<n && (sub * BLOCK_SIZE + threadIdx.x)<n? d_a[idx]:0;
idx = (sub * BLOCK_SIZE + threadIdx.y) * n + col;
tile_b[threadIdx.y][threadIdx.x] = col<n && (sub * BLOCK_SIZE + threadIdx.y)<n? d_b[idx]:0;
__syncthreads();
for (int k = 0; k < BLOCK_SIZE; ++k)
{
tmp += tile_a[threadIdx.y][k] * tile_b[k][threadIdx.x];
}
__syncthreads();
}
if(row < n && col < n)
{
d_result[row * n + col] = tmp;
}
}
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < k; ++j)
{
int tmp = 0.0;
for (int h = 0; h < n; ++h)
{
tmp += h_a[i * n + h] * h_b[h * k + j];
}
h_result[i * k + j] = tmp;
}
}
}
int main(int argc, char const *argv[])
{
int m=1000;
int n=1000;
int k=1000;
int *h_a, *h_b, *h_c, *h_cc, *h_cs;
CHECK(cudaMallocHost((void **) &h_a, sizeof(int)*m*n));
CHECK(cudaMallocHost((void **) &h_b, sizeof(int)*n*k));
CHECK(cudaMallocHost((void **) &h_c, sizeof(int)*m*k));
CHECK(cudaMallocHost((void **) &h_cc, sizeof(int)*m*k));
CHECK(cudaMallocHost((void **) &h_cs, sizeof(int)*m*k));
cudaEvent_t start, stop,stop_share;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventCreate(&stop_share));
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
h_a[i * n + j] = 1;
}
}
for (int i = 0; i < n; ++i) {
for (int j = 0; j < k; ++j) {
h_b[i * k + j] = 0;
}
}
int *d_a, *d_b, *d_c, *d_c_share;
CHECK(cudaMalloc((void **) &d_a, sizeof(int)*m*n));
CHECK(cudaMalloc((void **) &d_b, sizeof(int)*n*k));
CHECK(cudaMalloc((void **) &d_c, sizeof(int)*m*k));
CHECK(cudaMalloc((void **) &d_c_share, sizeof(int)*m*k));
CHECK(cudaEventRecord(start));
// copy matrix A and B from host to device memory
CHECK(cudaMemcpy(d_a, h_a, sizeof(int)*m*n, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_b, h_b, sizeof(int)*n*k, cudaMemcpyHostToDevice));
unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 dimGrid(grid_cols, grid_rows);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m,n,k);
CHECK(cudaMemcpy(h_c, d_c, (sizeof(int)*m*k), cudaMemcpyDeviceToHost));
//cudaThreadSynchronize();
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
gpu_matrix_mult_shared<<<dimGrid, dimBlock>>>(d_a, d_b, d_c_share, n);
CHECK(cudaMemcpy(h_cs, d_c_share, (sizeof(int)*m*k), cudaMemcpyDeviceToHost));
CHECK(cudaEventRecord(stop_share));
CHECK(cudaEventSynchronize(stop_share));
float elapsed_time, elapsed_time_share;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
CHECK(cudaEventElapsedTime(&elapsed_time_share, stop, stop_share));
printf("Time_global = %g ms.\n", elapsed_time);
printf("Time_share = %g ms.\n", elapsed_time_share);
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
cpu_matrix_mult(h_a, h_b, h_c, m, n, k);
int ok = 1;
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < k; ++j)
{
if(fabs(h_cs[i*k + j] - h_c[i*k + j])>(1.0e-10))
{
printf("hcs: %d hc: %d ",h_cs[i*k + j], h_c[i*k + j]);
ok = 0;
}
}
}
if(ok)
{
printf("Pass!!!\n");
}
else
{
printf("Error!!!\n");
}
// free memory
CHECK(cudaFree(d_a));
CHECK(cudaFree(d_b));
CHECK(cudaFree(d_c));
CHECK(cudaFreeHost(h_a));
CHECK(cudaFreeHost(h_b));
CHECK(cudaFreeHost(h_c));
CHECK(cudaFreeHost(h_cc));
return 0;
}
这里规模改成了1000,能让加速效果看得更明显一些。
bank conflict
内核代码这样写,其他的不变
__global__ void gpu_matrix_mult_shared(int *d_a, int *d_b, int *d_result, int m, int n, int k)
{
__shared__ int tile_a[BLOCK_SIZE][BLOCK_SIZE];
__shared__ int tile_b[BLOCK_SIZE][BLOCK_SIZE];
int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int tmp = 0;
int idx;
for (int sub = 0; sub < gridDim.x; ++sub)
{
// 没有 bank conflict
//idx = row * n + sub * BLOCK_SIZE + threadIdx.x;
//tile_a[threadIdx.y][threadIdx.x] = row<n && (sub * BLOCK_SIZE + threadIdx.x)<n? d_a[idx]:0;
//idx = (sub * BLOCK_SIZE + threadIdx.y) * n + col;
//tile_b[threadIdx.y][threadIdx.x] = col<n && (sub * BLOCK_SIZE + threadIdx.y)<n? d_b[idx]:0;
//生成 st bank conflict
idx = row * n + sub * BLOCK_SIZE + threadIdx.y;
tile_a[threadIdx.x][threadIdx.y] = row<n && (sub * BLOCK_SIZE + threadIdx.y)<n? d_a[idx]:0;
idx = (sub * BLOCK_SIZE + threadIdx.x) * n + col;
tile_b[threadIdx.x][threadIdx.y] = col<n && (sub * BLOCK_SIZE + threadIdx.x)<n? d_b[idx]:0;
__syncthreads();
for (int k = 0; k < BLOCK_SIZE; ++k)
{
tmp += tile_a[threadIdx.y][k] * tile_b[k][threadIdx.x];
}
__syncthreads();
}
if(row < n && col < n)
{
d_result[row * n + col] = tmp;
}
}
使用命令查看bank conflict情况,引入的是–events shared_ld_bank_conflict,shared_st_bank_conflict参数
sudo /usr/local/cuda/bin/nvprof --events shared_ld_bank_conflict,shared_st_bank_conflict ./matrix_mul
结果如下所示:
没有bank conflict的长这样:
打卡题目和其他知识:
-
GPU上多个线程的原子操作的多个分解步骤物理上不可能并行这句话对么?
答:这句话不完全正确。GPU上多个线程可以同时进行原子操作,但是对于特定的存储单元,只有一个线程能够成功地完成原子操作。因此,多个线程同时对相同的存储单元执行原子操作时,其中的一些线程可能会遇到冲突,并且仅有一个线程的操作会被执行,其他的线程的操作将被阻塞直到它们能够完成操作为止。 -
原子操作所使用的存储单元,必须是shared memory吗?
答:不是必须是shared memory。原子操作可以在多种存储单元中进行,包括global memory和shared memory,具体取决于操作的需求和性能要求。 -
cudaMallocHost和使用malloc的区别
回答:驱动程序跟踪用这个函数分配的虚拟内存范围,并自动加速对cudaMemcpy()等函数的调用。由于内存可以被设备直接访问,因此可以用比用malloc()等函数获得的可翻页内存高得多的带宽来读取或写入。另外:1. 存储位置:cudaMallocHost 分配的内存位于 Host 内存,而 malloc 分配的内存位于操作系统管理的堆中。2.可访问性:cudaMallocHost 分配的内存可以被 Host 和 GPU 访问,而 malloc 分配的内存仅能被 Host 访问。3.性能:cudaMallocHost 可以提高数据传输的性能,因为它允许 GPU 和 Host 共享内存,减少了数据传输时间;而 malloc 分配的内存必须在 GPU 和 Host 之间传输,可能导致更长的数据传输时间。4.避免内存虚拟化技术从内存移到disk。总的来说,如果需要在 GPU 和主机之间共享内存,则应使用 cudaMallocHost。但如果仅需要在主机上分配内存,则可以使用 malloc。 -
注意cudaMemcpy已经包含了同步过程,就不需要显式调用sync了