#include "device_launch_parameters.h"
#include <iostream>
#include <cuda_runtime.h>
#include <cstdint>
#include <opencv2/opencv.hpp>
#include "freshman.h"
//Note: a plain `if` bounds check suffices here — a `while`/grid-stride loop is only needed when the grid may be smaller than the data.
//Matrix multiplication: A (10x32) * B (32x25600) = C (10x25600).
//Uses a 2-D thread-block layout. (No shared memory is actually used, despite the original comment's claim.)
// Computes C = A * B, all matrices row-major:
//   A is (hC x wh), B is (wh x wC), C is (hC x wC).
// Launch with any 2-D grid/block covering at least wC*hC threads; each thread
// computes exactly one element of C, chosen by its flattened global thread id.
// No shared memory is used.
__global__ void MatMulKernel1D(float* C, float* A, float* B, const int wh, const int wC, const int hC)
{
    const int totalSize = wC * hC;  // number of elements of C
    int ny = threadIdx.y + blockDim.y * blockIdx.y;
    int nx = threadIdx.x + blockIdx.x * blockDim.x;
    // Flatten the 2-D global thread coordinate into a linear id.
    // BUGFIX: the original used `ny * hC + nx`, which is only correct when
    // gridDim.x * blockDim.x happens to equal hC (true for this file's launch
    // config, 1*10 == 10, but wrong for any other). Use the real x extent.
    int thID = ny * (gridDim.x * blockDim.x) + nx;
    if (thID < totalSize) {
        int Cx = thID / wC;  // row index into C
        int Cy = thID % wC;  // column index into C
        float rst = 0.0f;    // float literal — avoid accidental double math
        // Dot product of row Cx of A with column Cy of B.
        for (int i = 0; i < wh; i++) {
            rst += A[Cx * wh + i] * B[i * wC + Cy];
        }
        C[Cx * wC + Cy] = rst;
    }
}
// CPU reference implementation of C = A * B (row-major).
//   A is (hC x wA), B is (wA x wC), C is (hC x wC).
// wA is the shared inner dimension; used to validate the GPU result.
void matrixMulCPU(float* C, const float* A, const float* B, unsigned int wA,
unsigned int wC, unsigned int hC) {
    const unsigned int rows = hC;  // rows of A and of C
    const unsigned int cols = wC;  // columns of B and of C
    for (unsigned int row = 0; row < rows; ++row) {
        for (unsigned int col = 0; col < cols; ++col) {
            // Dot product of row `row` of A with column `col` of B.
            float acc = 0.0f;
            for (unsigned int k = 0; k < wA; ++k) {
                acc += A[row * wA + k] * B[k * cols + col];
            }
            C[row * cols + col] = acc;
        }
    }
}
// Host driver: builds A (hc x wh) and B (wh x wc), multiplies them on the CPU
// and on the GPU, times both, and checks that the two results agree.
// Helpers (CHECK, initialData, cpuSecond, checkResult) come from freshman.h.
int main() {
    // set up device
    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device : %d: %s \n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));
    // set up data size of matrix: A (hc x wh), B (wh x wc), C (hc x wc)
    int hc = 10;
    int wh = 32;
    int wc = 25600;
    int nA = hc * wh;
    int nAbytes = nA * sizeof(float);
    printf("A Matrix size : h %d w %d \n", hc, wh);
    int nB = wh * wc;
    int nBbytes = nB * sizeof(float);
    printf("B Matrix size : h %d w %d \n", wh, wc);
    int nC = hc * wc;
    int nCbytes = nC * sizeof(float);
    // BUGFIX: this line printed "B Matrix size" for C's dimensions.
    printf("C Matrix size : h %d w %d \n", hc, wc);
    // malloc host memory
    float* a_h, * b_h, * res_h, * res_cpu;
    a_h = (float*)malloc(nAbytes);
    b_h = (float*)malloc(nBbytes);
    res_h = (float*)malloc(nCbytes);
    res_cpu = (float*)malloc(nCbytes);
    // initialize data at host device
    initialData(a_h, nA);
    initialData(b_h, nB);
    printf("initialize data complete! %f, %f", a_h[0], b_h[0]);
    // set 0 at host device for res
    memset(res_h, 0, nCbytes);
    memset(res_cpu, 0, nCbytes);
    printf("set 0 at host device complete!");
    // CPU reference multiply (timed)
    double istart = cpuSecond();
    matrixMulCPU(res_cpu, a_h, b_h, wh, wc, hc);
    double iElaps = cpuSecond() - istart;
    printf("CPU Exection time : %f \n", iElaps);
    // malloc device memory — CHECK every CUDA call: a sticky error otherwise
    // makes every later call fail mysteriously
    float* a_d, * b_d, * c_d;
    CHECK(cudaMalloc((void**)&a_d, nAbytes));
    CHECK(cudaMalloc((void**)&b_d, nBbytes));
    CHECK(cudaMalloc((void**)&c_d, nCbytes));
    // transfer data from host to device
    CHECK(cudaMemcpy(a_d, a_h, nAbytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(b_d, b_h, nBbytes, cudaMemcpyHostToDevice));
    // invoke kernel on the device
    int dimx = 10;
    int dimy = 64;
    dim3 block(dimx, dimy);
    dim3 grid((hc + block.x - 1) / block.x, (wc + block.y - 1) / block.y);
    istart = cpuSecond();
    MatMulKernel1D <<<grid, block >>> (c_d, a_d, b_d, wh, wc, hc);
    CHECK(cudaGetLastError());       // catch launch-configuration errors
    CHECK(cudaDeviceSynchronize());  // wait for the kernel; surface async faults
    iElaps = cpuSecond() - istart;
    printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
        grid.x, grid.y, block.x, block.y, iElaps);
    CHECK(cudaMemcpy(res_h, c_d, nCbytes, cudaMemcpyDeviceToHost));
    // check if cpu result and gpu result are the same
    checkResult(res_h, res_cpu, nC);
    // free device memory
    CHECK(cudaFree(a_d));
    CHECK(cudaFree(b_d));
    CHECK(cudaFree(c_d));
    // free host memory
    free(a_h);
    free(b_h);
    free(res_h);
    free(res_cpu);  // BUGFIX: res_cpu was leaked in the original
    // reset device
    CHECK(cudaDeviceReset());
    return 0;
}
// CUDA matrix multiplication example.
// (Stray blog-scrape trailer converted to a comment so the file compiles;
// original text: "最新推荐文章于 2024-04-29 13:34:21 发布")