CUDA矩阵乘法 (CUDA matrix multiplication — tiled shared-memory kernel with CPU reference check)
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 32

/*
 * Fill an n x n matrix a (row stride lda, in elements) with pseudo-random
 * values in roughly [0, 1]: a coarse part plus a finer part so entries get
 * more random mantissa bits than a single rand() call provides.
 * The caller is expected to seed rand() (srand) beforehand.
 */
void matgen(float* a, int lda, int n)
{
    int i, j;
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            /* Fix: the original computed RAND_MAX * RAND_MAX in int, which
             * overflows (undefined behavior) on platforms where
             * RAND_MAX == INT_MAX.  Do the product in double instead. */
            a[i * lda + j] = (float)rand() / RAND_MAX +
                (float)(rand() / ((double)RAND_MAX * RAND_MAX));
        }
    }
}
/*
 * Naive O(n^3) host-side matrix multiply: C = A * B, where all three
 * matrices are n x n with row strides lda/ldb/ldc (in elements).  Each
 * dot product is accumulated in double to reduce rounding error, then
 * narrowed to float on store.  Returns elapsed clock() ticks.
 */
clock_t matmultCPU(const float* a, int lda, const float* b, int ldb,
    float* c, int ldc, int n)
{
    const clock_t begin = clock();
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            double acc = 0;
            for (int k = 0; k < n; k++) {
                acc += a[row * lda + k] * b[k * ldb + col];
            }
            c[row * ldc + col] = acc;
        }
    }
    return clock() - begin;
}
/*
 * Print the maximum and mean relative error between two n x n matrices,
 * a (stride lda) against reference b (stride ldb).  Entries whose
 * reference value is exactly zero are skipped to avoid dividing by zero,
 * but the mean is still taken over all n*n entries.
 */
void compare_mat(const float* a, int lda,
    const float* b, int ldb, int n)
{
    float worst = 0;
    float total = 0;
    for (int r = 0; r < n; r++) {
        for (int col = 0; col < n; col++) {
            const float ref = b[r * ldb + col];
            if (ref != 0) {
                const float rel = fabs((a[r * lda + col] - ref) / ref);
                if (rel > worst) worst = rel;
                total += rel;
            }
        }
    }
    printf("Max error: %g Average error: %g\n",
        worst, total / (n * n));
}
/*
 * Tiled matrix-multiply kernel: C = A * B for n x n matrices with row
 * strides lda/ldb/ldc given in ELEMENTS (not bytes).
 *
 * Launch expectations: blockDim == (BLOCK_SIZE, BLOCK_SIZE) and a grid of
 * ceil(n/BLOCK_SIZE) x ceil(n/BLOCK_SIZE) blocks; each block computes one
 * BLOCK_SIZE x BLOCK_SIZE tile of C.  Tile positions that fall outside
 * the matrices are zero-filled, so n need not be a multiple of BLOCK_SIZE.
 *
 * The per-thread dot product uses Kahan compensated summation: `comp`
 * carries the accumulated rounding error so float results stay close to
 * a double-precision reference.
 */
__global__ static void matMultCUDA(const float* a, size_t lda,
    const float* b, size_t ldb, float* c, size_t ldc, int n)
{
    __shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
    const int tidc = threadIdx.x;              /* column within the tile */
    const int tidr = threadIdx.y;              /* row within the tile */
    const int bidc = blockIdx.x * BLOCK_SIZE;  /* tile's first column in C */
    const int bidr = blockIdx.y * BLOCK_SIZE;  /* tile's first row in C */
    int i, j;
    float results = 0;
    float comp = 0;                            /* Kahan compensation term */
    for (j = 0; j < n; j += BLOCK_SIZE) {
        /* Stage one tile of A and one tile of B into shared memory,
         * zero-filling out-of-range positions. */
        if (tidr + bidr < n && tidc + j < n) {
            matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
        }
        else {
            matA[tidr][tidc] = 0;
        }
        if (tidr + j < n && tidc + bidc < n) {
            matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
        }
        else {
            matB[tidr][tidc] = 0;
        }
        /* Barrier is outside the divergent branches: every thread reaches
         * it.  All tile writes must complete before any thread reads. */
        __syncthreads();
        for (i = 0; i < BLOCK_SIZE; i++) {
            float t;
            comp -= matA[tidr][i] * matB[i][tidc];
            t = results - comp;
            comp = (t - results) + comp;   /* recover the rounding error */
            results = t;
        }
        /* Keep faster threads from refilling the tiles while slower ones
         * are still reading them. */
        __syncthreads();
    }
    if (tidr + bidr < n && tidc + bidc < n) {
        c[(tidr + bidr) * ldc + tidc + bidc] = results;
    }
}
/*
 * GPU matrix multiply: C = A * B for n x n host matrices with row strides
 * lda/ldb/ldc (elements).  Allocates pitched device buffers so every row
 * is aligned, copies A and B over, launches the tiled kernel, and copies
 * the result back into c.  Returns elapsed clock() ticks, including
 * allocation and transfer time.
 *
 * Fix: the kernel launch had lost its <<<grid, block>>> execution
 * configuration (scrape artifact `<<>>`); it is restored here, and a
 * cudaGetLastError() check is added because launch failures are
 * otherwise silent.
 */
clock_t matmultCUDA(const float* a, int lda,
    const float* b, int ldb, float* c, int ldc, int n)
{
    float *ac, *bc, *cc;
    size_t pitch_a, pitch_b, pitch_c;
    clock_t start, end;
    start = clock();
    cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float) * n, n);
    cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float) * n, n);
    cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float) * n, n);

    cudaMemcpy2D(ac, pitch_a, a, sizeof(float) * lda,
        sizeof(float) * n, n, cudaMemcpyHostToDevice);
    cudaMemcpy2D(bc, pitch_b, b, sizeof(float) * ldb,
        sizeof(float) * n, n, cudaMemcpyHostToDevice);

    int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;   /* ceil(n / BLOCK_SIZE) */
    dim3 blocks(bx, bx);
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    /* Pitches are converted from bytes to elements for the kernel. */
    matMultCUDA<<<blocks, threads>>>(ac, pitch_a / sizeof(float),
        bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);
    cudaError_t err = cudaGetLastError();   /* catches bad launch config */
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n",
            cudaGetErrorString(err));
    }

    /* cudaMemcpy2D on the default stream blocks until the kernel is done,
     * so the timing below covers the full GPU pipeline. */
    cudaMemcpy2D(c, sizeof(float) * ldc, cc, pitch_c,
        sizeof(float) * n, n, cudaMemcpyDeviceToHost);
    cudaFree(ac);
    cudaFree(bc);
    cudaFree(cc);
    end = clock();
    return end - start;
}
/*
 * Driver: builds two random 1024x1024 matrices, multiplies them on the
 * CPU (reference) and the GPU, reports the GPU result's relative error,
 * and prints both timings.
 *
 * Fixes: clock_t ticks were printed with %d and labelled "ms" -- they are
 * now converted to milliseconds via CLOCKS_PER_SEC; malloc results are
 * checked; buffers are freed; system("pause") is guarded to Windows.
 */
int main()
{
    float *a, *b, *c, *d;
    int n = 1024;
    a = (float*)malloc(sizeof(float) * n * n);
    b = (float*)malloc(sizeof(float) * n * n);
    c = (float*)malloc(sizeof(float) * n * n);   /* GPU result */
    d = (float*)malloc(sizeof(float) * n * n);   /* CPU reference */
    if (!a || !b || !c || !d) {
        fprintf(stderr, "out of memory\n");
        free(a); free(b); free(c); free(d);
        return 1;
    }
    srand(0);   /* fixed seed: reproducible matrices across runs */
    matgen(a, n, n);
    matgen(b, n, n);
    clock_t cpu_time = matmultCPU(a, n, b, n, d, n, n);
    clock_t gpu_time = matmultCUDA(a, n, b, n, c, n, n);
    compare_mat(c, n, d, n, n);
    printf("CPU time used: %.0fms \n", cpu_time * 1000.0 / CLOCKS_PER_SEC);
    printf("GPU time used: %.0fms \n", gpu_time * 1000.0 / CLOCKS_PER_SEC);

    free(a);
    free(b);
    free(c);
    free(d);
#ifdef _WIN32
    system("pause");   /* keep the console window open on Windows */
#endif
    return 0;
}
参考:《深入浅出谈CUDA》
标签:int,float,矩阵,CUDA,pitch,sizeof,SIZE,BLOCK,乘法
来源: https://www.cnblogs.com/ww1x/p/10856452.html