// CUDA arbitrary-size matrix multiplication — TLP (thread-level parallelism) final version

#include <math.h>    // fabs (used in the error check in main)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>  // memset (used on host buffers in main)
#include <iostream>
#include <malloc.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include "device_launch_parameters.h"
//#include "common.h"
using namespace std;
#define BLOCK_SIZE_X 32
#define BLOCK_SIZE_Y 32
#define  AW 1539
#define  AH 700
#define  BW 800
#define  BH 1539


// Row-major dense matrix: element (y, x) lives at elements[y * width + x].
// The same struct is used for host and device matrices; `elements` points to
// host or device memory depending on context.
typedef struct
{
int width;        // number of columns (row stride, in floats)
int height;       // number of rows
float *elements;  // width * height floats, row-major
}Matrix; 
// Fill a dataH x dataW row-major matrix with pseudo-random values in [0, 1].
// Traverses the flat buffer in the same order as a row-major (y, x) double
// loop, so the sequence of rand() draws is unchanged.
void generdata(Matrix data, int dataW, int dataH)
{
    const int total = dataW * dataH;
    for (int idx = 0; idx < total; idx++)
    {
        data.elements[idx] = (float)rand() / RAND_MAX;
    }
}
// Host reference implementation of C = A * B.
// Products are formed in float (as on the GPU) but accumulated in double,
// making this a higher-accuracy baseline for the error comparison.
// Caller must ensure A.width == B.height, C.height == A.height, C.width == B.width.
void MatMuti_CPU(const Matrix A, const Matrix B, Matrix C)
{
    for (int r = 0; r < C.height; ++r)
    {
        const float *aRow = &A.elements[r * A.width];
        float *cRow = &C.elements[r * C.width];
        for (int c = 0; c < C.width; ++c)
        {
            double acc = 0;
            for (int k = 0; k < A.width; ++k)
            {
                // float * float product, then widened for the double accumulate
                acc += aRow[k] * B.elements[k * B.width + c];
            }
            cRow[c] = (float)acc;
        }
    }
}
// Naive GEMM kernel: one thread computes one element of C = A * B straight
// from global memory. The A.width-long dot product is accumulated with a
// Kahan-style compensated sum to reduce float round-off.
// Launch with a 2D grid/block covering C.width x C.height; out-of-range
// threads are masked by the guard below.
__global__ void
MatMultlinative_Kernel(const Matrix A, const Matrix B, Matrix C)
{
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y * blockDim.y + threadIdx.y;


if (col< C.width && row < C.height)
{
float Cvalue = 0.0f;
float eps = 0.0f;  // running compensation (negated lost low-order bits)
for (int i = 0; i < A.width; i++)
{
float t;
// Plain form would be: Cvalue += A.elements[row*A.width+i] * B.elements[i*B.width+col];
// The three statements below are the compensated equivalent; their order matters.
eps -= A.elements[row*A.width+i] * B.elements[i*B.width+col];
t = Cvalue - eps;
eps = (t - Cvalue) + eps;
Cvalue = t;
}


C.elements[row*C.width+col] = Cvalue;
}


}


// Tiled GEMM kernel: C = A * B using BLOCK_SIZE_Y x BLOCK_SIZE_X shared-memory
// tiles of A and B. Out-of-range lanes load zeros instead of returning early,
// so every thread of the block reaches both __syncthreads() barriers.
// The inner dot product uses a Kahan-style compensated sum.
// Requires blockDim == (BLOCK_SIZE_X, BLOCK_SIZE_Y) to match the tile arrays.
__global__ void
MatMultiShare_Kernel(const Matrix A, const Matrix B, Matrix C)
{
const int Bx = blockIdx.x;
const int By = blockIdx.y;


const int tx = threadIdx.x;
const int ty = threadIdx.y;

int col = Bx * blockDim.x + tx;
int row = By * blockDim.y + ty;


__shared__ float Ashared[BLOCK_SIZE_Y][BLOCK_SIZE_X];
__shared__ float Bshared[BLOCK_SIZE_X][BLOCK_SIZE_Y];


float Cvalue = 0.0f;
float eps = 0.0f;  // Kahan compensation term
// Slide the tile window along A's row / B's column, one block-width per step.
for (int k = 0; k < A.width; k +=blockDim.x)
{
// Load one element of the A tile; zero-pad outside the matrix so no
// thread has to skip the barrier below.
if ((tx+k)<A.width && row < A.height)
{
Ashared[ty][tx] = A.elements[row*A.width+(tx+k)];
}
else
{
Ashared[ty][tx] = 0;
}
// Same for the B tile.
if ((ty+k)<B.height && col < B.width)
{
Bshared[ty][tx] = B.elements[(ty+k)*B.width+col];
}
else
{
Bshared[ty][tx] = 0;
}


__syncthreads();  // tile fully loaded before any thread reads it


for (int e = 0; e < blockDim.x;e++)
{
// Kahan-compensated form of: Cvalue += Ashared[ty][e] * Bshared[e][tx];
float t;
eps -= Ashared[ty][e] * Bshared[e][tx];
t = Cvalue - eps;
eps = (t - Cvalue) + eps;
Cvalue = t;
}


__syncthreads();  // finish reading before the next iteration overwrites the tile
}


// Only in-range threads write a result; padded lanes just helped load tiles.
if (col < C.width && row < C.height)
C.elements[row*C.width+col] = Cvalue;
}


// Tiled GEMM kernel for pitched (padded) matrices: C = A * B.
// PRECONDITIONS (guaranteed by the cudaMallocPitch setup in main):
//   - A.width (pitch_A in floats), B.width, C.width are the padded row lengths,
//     and d_A.height / d_B.height are rounded up to block multiples, so every
//     shared-memory tile load below is in-bounds without per-element guards.
// Requires blockDim == (BLOCK_SIZE_X, BLOCK_SIZE_Y).
//
// Fix vs. the original: out-of-range threads previously `return`ed *before*
// the __syncthreads() calls inside the tile loop — a divergent barrier, which
// is undefined behavior. All threads now run the full tile loop; only the
// final store is guarded.
__global__ void
MatMultiSharePitch_Kernel(const Matrix A, const Matrix B, Matrix C)
{
const int tx = threadIdx.x;
const int ty = threadIdx.y;

const int col = blockIdx.x * blockDim.x + tx;
const int row = blockIdx.y * blockDim.y + ty;


__shared__ float Ashared[BLOCK_SIZE_Y][BLOCK_SIZE_X];
__shared__ float Bshared[BLOCK_SIZE_X][BLOCK_SIZE_Y];


float Cvalue = 0.0f;
float eps = 0.0f;  // Kahan compensation term
for (int k = 0; k < A.width; k += blockDim.x)
{
// Padded dimensions make these loads safe for every thread in the grid.
Ashared[ty][tx] = A.elements[row*A.width+(tx+k)];
Bshared[ty][tx] = B.elements[(ty+k)*B.width+col];


__syncthreads();  // tile fully loaded before any thread reads it


for (int e = 0; e < blockDim.x; e++)
{
// Kahan-compensated form of: Cvalue += Ashared[ty][e] * Bshared[e][tx];
float t;
eps -= Ashared[ty][e] * Bshared[e][tx];
t = Cvalue - eps;
eps = (t - Cvalue) + eps;
Cvalue = t;
}


__syncthreads();  // finish reading before the next iteration overwrites the tile
}


// Guard only the store: every thread took part in the barriers above,
// but threads outside the logical C must not write.
if (col < C.width && row < C.height)
C.elements[row*C.width+col] = Cvalue;
}
// Abort with a readable message when a CUDA runtime call fails. Without this,
// a sticky error from an early call silently corrupts every later result.
static void checkCuda(cudaError_t err, const char *what)
{
if (err != cudaSuccess)
{
fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

// Host driver: build random A (AH x AW) and B (BH x BW), multiply on the GPU
// with the pitched tiled kernel, time the kernel with CUDA events, then
// validate against the CPU reference and report max/average relative error.
int main()
{
// --- Host inputs: A is AH x AW, B is BH x BW (AW == BH by construction). ---
Matrix A;
A.width = AW;
A.height = AH;
size_t size = A.width * A.height * sizeof(float);
A.elements = (float *)malloc(size);
memset(A.elements, 0, size);
generdata(A, A.width, A.height);


Matrix B;
B.width = BW;
B.height = BH;
size = B.width * B.height * sizeof(float);
B.elements = (float *)malloc(size);
memset(B.elements, 0, size);
generdata(B, B.width, B.height);


// --- Device copy of A: height rounded up to a block multiple and rows padded
// by cudaMallocPitch, so the pitched kernel needs no per-load bounds checks.
// The padding is zeroed so padded lanes contribute nothing to dot products. ---
Matrix d_A;
d_A.width = A.width; d_A.height = (A.height+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y*BLOCK_SIZE_Y;
size_t pitch_A;
checkCuda(cudaMallocPitch((void**)&d_A.elements, &pitch_A, sizeof(float)*A.width, d_A.height), "cudaMallocPitch d_A");
checkCuda(cudaMemset(d_A.elements, 0 , pitch_A*d_A.height), "cudaMemset d_A");
checkCuda(cudaMemcpy2D(d_A.elements, pitch_A, A.elements, sizeof(float)*A.width, sizeof(float)*A.width, A.height,
cudaMemcpyHostToDevice), "cudaMemcpy2D d_A");


// --- Device copy of B: its height is padded to A's pitched row length so the
// tile loop (which runs to the padded A.width) stays in-bounds. ---
Matrix d_B;
d_B.width = B.width; d_B.height = pitch_A/sizeof(float);
size_t pitch_B;
checkCuda(cudaMallocPitch((void**)&d_B.elements, &pitch_B, sizeof(float)*d_B.width, d_B.height), "cudaMallocPitch d_B");
checkCuda(cudaMemset(d_B.elements, 0 , pitch_B*d_B.height), "cudaMemset d_B");
checkCuda(cudaMemcpy2D(d_B.elements, pitch_B, B.elements, sizeof(float)*B.width, sizeof(float)*B.width, B.height,
cudaMemcpyHostToDevice), "cudaMemcpy2D d_B");


// --- Host result buffers: CPU reference and GPU readback. ---
Matrix C_CPU;
C_CPU.width = B.width; C_CPU.height = A.height;
size = C_CPU.width * C_CPU.height * sizeof(float);
C_CPU.elements = (float *)malloc(size);
memset(C_CPU.elements, 0, size);


Matrix C;
C.width = B.width; C.height = A.height;
size = C.width * C.height * sizeof(float);
C.elements = (float *)malloc(size);
memset(C.elements, 0, size);


// --- Device result, pitched and zeroed. ---
Matrix d_C;
d_C.width = d_B.width; d_C.height = d_A.height;
size_t pitch_C;
checkCuda(cudaMallocPitch((void**)&d_C.elements, &pitch_C, sizeof(float)*C.width, C.height), "cudaMallocPitch d_C");
checkCuda(cudaMemset(d_C.elements, 0, pitch_C*C.height), "cudaMemset d_C");


cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);


// Grid covers the logical (unpadded) C. The device structs then switch their
// widths to elements-per-pitched-row so kernel indexing matches the layout.
dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimGrid((C.width + dimBlock.x-1) / dimBlock.x, (C.height +dimBlock.y-1) / dimBlock.y);
d_A.width = pitch_A/sizeof(float);
d_B.width = pitch_B/sizeof(float);
d_C.width = pitch_C/sizeof(float);


cudaEventRecord(start,0);
MatMultiSharePitch_Kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
//MatMultiShare_Kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
//MatMultlinative_Kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
checkCuda(cudaGetLastError(), "kernel launch");           // bad launch config
cudaEventRecord(stop,0);
checkCuda(cudaEventSynchronize(stop), "kernel execution"); // async in-kernel faults
float costtime=0;
cudaEventElapsedTime(&costtime,start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);


printf("eslp times %f ms\n",costtime);


// Copy back only the unpadded C.width x C.height region.
checkCuda(cudaMemcpy2D(C.elements, sizeof(float)*C_CPU.width, d_C.elements, pitch_C, sizeof(float)*C_CPU.width, C.height,
cudaMemcpyDeviceToHost), "cudaMemcpy2D C");


// --- CPU reference and per-element relative-error comparison. ---
MatMuti_CPU(A, B, C_CPU);
float Maxerror = 0;
double avgerror = 0.0;  // double: summing C.width*C.height small float terms
for (int row = 0; row < C.height; row++)
{
for (int col = 0; col < C.width; col++)
{
float ref = C_CPU.elements[row*C_CPU.width+col];
float diff = fabs(C.elements[row*C.width+col] - ref);
// Relative error; fall back to absolute error if the reference is zero.
float error = (ref != 0.0f) ? diff / fabs(ref) : diff;
if (Maxerror < error)
{
Maxerror = error;
}
avgerror += error;
}
}


printf("Max error: %g Average error: %g\n", Maxerror, avgerror / (C.height * C.width));  


free(A.elements);
free(B.elements);
free(C.elements);
free(C_CPU.elements);
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
cudaDeviceReset();
getchar();  // keep the console window open until a key is pressed
return 0;
}