本人由于学习需要,最近一直在忙向量与矩阵的乘法,目前完成了一维索引和二维索引的代码,虽然还有很多不足,但是还是贴出来,一是希望可以记录一下目前阶段的进度,二来也是希望各位大神可以指点一二。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#define arraySize 128
#define threadsPerBlock 256
#define blocksPerGrid arraySize*(arraySize + threadsPerBlock - 1)/threadsPerBlock
#define TILE_WIDTH 8
__global__ void VecMulMat(const int *a, const int *b, int *c)//一维索引,如果矩阵过大,程序无法正确运行,待修改
{
__shared__ int temp_c[threadsPerBlock];
int temp = 0;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int row = idx / arraySize;
int column = idx % arraySize;
if ((row < arraySize) && (column < arraySize)){
temp_c[idx] = a[row] * b[row * arraySize + column];
}
__syncthreads();
if (column < arraySize && idx < arraySize * arraySize){
atomicAdd(&c[column], temp_c[idx]);
}
}
__global__ void VecMulMat_2D(const int *a, const int *b, int *c){//二维索引,代码逻辑还是不太懂
__shared__ int sharedA[TILE_WIDTH];
__shared__ int sharedB[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
int v = 0;
for (int i = 0; i < (arraySize + TILE_WIDTH - 1) / TILE_WIDTH; i++){
if (i * TILE_WIDTH + ty < arraySize && col < arraySize){
sharedB[ty][tx] = b[(i*TILE_WIDTH + ty)*arraySize + col];
}
else sharedB[ty][tx] = 0;
if (i*TILE_WIDTH + tx < arraySize){
sharedA[tx] = a[i*TILE_WIDTH + tx];
}
else sharedA[tx] = 0;
__syncthreads();
for (int j = 0; j < TILE_WIDTH; j++){
v += sharedA[j] * sharedB[j][tx];
}
__syncthreads();
}
if (col < arraySize){
c[col] = v;
}
}
int main()
{
int a[arraySize];
int b[arraySize * arraySize];
int c[arraySize];
int twoD_C[arraySize];
int Dev_c[arraySize];
int *dev_a, *dev_b, *dev_c, *two_c;
for (int i = 0; i < arraySize; i++){
a[i] = i;
}
for (int i = 0; i < arraySize; i++){
for (int j = 0; j < arraySize; j++){
b[i * arraySize + j] = i * arraySize + j;
}
}
for (int i = 0; i < arraySize; i++){
c[i] = 0;
}
cudaMalloc((void**)&dev_a, arraySize * sizeof(int));
cudaMalloc((void**)&dev_b, arraySize * arraySize * sizeof(int));
cudaMalloc((void**)&dev_c, arraySize * sizeof(int));
cudaMemset(dev_c, 0, arraySize * sizeof(int));
cudaMalloc((void**)&two_c, arraySize * sizeof(int));
cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, arraySize * arraySize * sizeof(int), cudaMemcpyHostToDevice);
dim3 DimGrid((arraySize + TILE_WIDTH - 1) / TILE_WIDTH, (arraySize + TILE_WIDTH - 1) / TILE_WIDTH, 1);
dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1);
float time_elapsed = 0;
int nIter = 300;
cudaEvent_t start_kernel, end_kernel;
cudaEventCreate(&start_kernel);
cudaEventCreate(&end_kernel);
cudaEventRecord(start_kernel, 0);
VecMulMat_2D << <DimGrid, DimBlock >> >(dev_a, dev_b, two_c);
cudaMemcpy(twoD_C, two_c, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(end_kernel, 0);
cudaEventSynchronize(end_kernel);
cudaEventElapsedTime(&time_elapsed, start_kernel, end_kernel);
time_elapsed /= nIter;
/*
for (int i = 0; i < nIter; i++){
VecMulMat << <blocksPerGrid, threadsPerBlock>> >(dev_a, dev_b, dev_c);
}
*/
//cudaMemcpy(Dev_c, dev_c, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < arraySize; i++){
for (int j = 0; j < arraySize; j++){
c[i * arraySize + j] = a[i] * b[i * arraySize + j];
}
}
for (int i = 1; i < arraySize; i++){
for (int j = 0; j < arraySize; j++){
c[j] += c[i * arraySize + j];
}
}
/*
for (int i = 1; i < arraySize; i++){
for (int j = 0; j < arraySize; j++){
Dev_c[j] += Dev_c[i * arraySize + j];
}
}
for (int i = 0; i < arraySize; i++){
printf("c[%d] = %d\n", i, c[i]);
printf("Dev_c[%d] = %d\n", i, Dev_c[i]);
}*/
int mul_err = 1;
for (int i = 0; i < arraySize; i++){
if (c[i] - twoD_C[i] != 0){
mul_err = 0;
break;
}
}
if (mul_err) printf("success!\n");
else printf("failure!\n");
printf("kernel time: %lf(ms)\n", time_elapsed);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
cudaFree(two_c);
system("pause");
return 0;
}