CUDA矩阵乘法 (CUDA matrix multiplication)

CUDA矩阵乘法

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 32

/*
 * Fill an n x n row-major matrix (leading dimension lda) with random
 * values in [0, 2): a coarse part in [0, 1] plus a tiny fractional
 * refinement so that entries exercise the low-order float bits.
 * Call srand() first for reproducible output.
 */
void matgen(float* a, int lda, int n)
{
    int i, j;
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            /* BUG FIX: the original computed RAND_MAX * RAND_MAX in int,
             * which overflows (UB) on platforms where RAND_MAX is 2^31-1
             * (e.g. glibc).  Promote to double before multiplying. */
            a[i * lda + j] = (float)rand() / RAND_MAX +
                (float)rand() / ((double)RAND_MAX * RAND_MAX);
        }
    }
}

/*
 * Naive O(n^3) CPU reference multiply: C = A * B for n x n row-major
 * matrices with leading dimensions lda / ldb / ldc.  Each dot product
 * is accumulated in double to limit round-off, then stored as float.
 * Returns the elapsed time in clock() ticks.
 */
clock_t matmultCPU(const float* a, int lda, const float* b, int ldb,
    float* c, int ldc, int n)
{
    clock_t t0 = clock();
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            double acc = 0;
            for (int k = 0; k < n; k++) {
                acc += a[row * lda + k] * b[k * ldb + col];
            }
            c[row * ldc + col] = acc;
        }
    }
    return clock() - t0;
}

/*
 * Print the maximum and mean relative error between two n x n matrices:
 * a (leading dimension lda) against the reference b (leading dimension
 * ldb).  Entries where the reference is exactly zero are skipped to
 * avoid dividing by zero; the average is still taken over n*n.
 */
void compare_mat(const float* a, int lda,
    const float* b, int ldb, int n)
{
    float worst = 0;
    float total = 0;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            float ref = b[i * ldb + j];
            if (ref == 0) continue;
            float err = fabs((a[i * lda + j] - ref) / ref);
            if (err > worst) worst = err;
            total += err;
        }
    }
    printf("Max error: %g Average error: %g\n",
        worst, total / (n * n));
}

/*
 * Tiled matrix-multiply kernel: C = A * B for n x n matrices stored in
 * row-major order with row pitches lda / ldb / ldc given in ELEMENTS
 * (the host wrapper divides the byte pitch by sizeof(float)).
 *
 * Expected launch configuration:
 *   threads = (BLOCK_SIZE, BLOCK_SIZE)
 *   blocks  = ceil(n / BLOCK_SIZE) in both x and y
 * Each block computes one BLOCK_SIZE x BLOCK_SIZE tile of C, streaming
 * matching tiles of A and B through shared memory.
 */
__global__ static void matMultCUDA(const float* a, size_t lda,
    const float* b, size_t ldb, float* c, size_t ldc, int n)
{
    /* Shared-memory staging tiles for the current step. */
    __shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
    const int tidc = threadIdx.x;              /* column inside the tile    */
    const int tidr = threadIdx.y;              /* row inside the tile       */
    const int bidc = blockIdx.x * BLOCK_SIZE;  /* tile's first column in C  */
    const int bidr = blockIdx.y * BLOCK_SIZE;  /* tile's first row in C     */
    int i, j;
    float results = 0;   /* running dot product (Kahan sum)               */
    float comp = 0;      /* Kahan compensation term, kept in negated form */
    /* Walk tiles along the shared (inner) dimension. */
    for (j = 0; j < n; j += BLOCK_SIZE) {
        /* Load one tile of A; zero-pad past the matrix edge so threads
         * outside the matrix contribute nothing to the sum. */
        if (tidr + bidr < n && tidc + j < n) {
            matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
        }
        else {
            matA[tidr][tidc] = 0;
        }
        /* Load the matching tile of B, zero-padded the same way. */
        if (tidr + j < n && tidc + bidc < n) {
            matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
        }
        else {
            matB[tidr][tidc] = 0;
        }
        /* All loads must land before any thread reads the tiles.
         * (Barrier is outside the divergent branches, so every thread
         * in the block reaches it.) */
        __syncthreads();
        /* Accumulate this tile's partial dot product with Kahan
         * compensated summation: comp carries the negated low-order
         * error lost to float rounding.  The statement order below is
         * load-bearing — do not "simplify" it. */
        for (i = 0; i < BLOCK_SIZE; i++) {
            float t;
            comp -= matA[tidr][i] * matB[i][tidc];
            t = results - comp;
            comp = (t - results) + comp;
            results = t;
        }
        /* Keep fast threads from overwriting tiles still being read. */
        __syncthreads();
    }
    /* Write the result, guarding the ragged edge when n is not a
     * multiple of BLOCK_SIZE. */
    if (tidr + bidr < n && tidc + bidc < n) {
        c[(tidr + bidr) * ldc + tidc + bidc] = results;
    }
}

/*
 * GPU matrix multiply: C = A * B for n x n row-major host matrices with
 * leading dimensions lda / ldb / ldc.  Copies the inputs into pitched
 * device memory, launches the tiled kernel, and copies the product back.
 * Returns elapsed clock() ticks (includes allocation and transfers, so
 * this measures end-to-end cost, not kernel time alone).
 */
clock_t matmultCUDA(const float* a, int lda,
    const float* b, int ldb, float* c, int ldc, int n)
{
    float *ac, *bc, *cc;
    size_t pitch_a, pitch_b, pitch_c;
    clock_t start, end;
    start = clock();
    /* Pitched allocations keep every row aligned for coalesced access. */
    cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float) * n, n);
    cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float) * n, n);
    cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float) * n, n);

    cudaMemcpy2D(ac, pitch_a, a, sizeof(float) * lda,
        sizeof(float) * n, n, cudaMemcpyHostToDevice);
    cudaMemcpy2D(bc, pitch_b, b, sizeof(float) * ldb,
        sizeof(float) * n, n, cudaMemcpyHostToDevice);

    /* One BLOCK_SIZE x BLOCK_SIZE thread block per output tile; round
     * the grid up so a partial tile at the edge is still covered. */
    int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 blocks(bx, bx);
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    /* BUG FIX: the launch configuration was missing ("<<>>"), which does
     * not compile.  The kernel takes pitches in elements, hence the
     * division of the byte pitch by sizeof(float). */
    matMultCUDA<<<blocks, threads>>>(ac, pitch_a / sizeof(float),
        bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

    /* cudaMemcpy2D is synchronous, so it also waits for the kernel. */
    cudaMemcpy2D(c, sizeof(float) * ldc, cc, pitch_c,
        sizeof(float) * n, n, cudaMemcpyDeviceToHost);
    cudaFree(ac);
    cudaFree(bc);
    cudaFree(cc);
    end = clock();
    return end - start;
}

/*
 * Driver: build two random 1024 x 1024 matrices, multiply them on both
 * CPU and GPU, and report the relative error and both timings.
 */
int main()
{
    float *a, *b, *c, *d;
    int n = 1024;
    a = (float*)malloc(sizeof(float) * n * n);
    b = (float*)malloc(sizeof(float) * n * n);
    c = (float*)malloc(sizeof(float) * n * n);
    d = (float*)malloc(sizeof(float) * n * n);
    /* BUG FIX: bail out instead of dereferencing NULL on OOM. */
    if (!a || !b || !c || !d) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    srand(0);  /* fixed seed so runs are reproducible */
    matgen(a, n, n);
    matgen(b, n, n);
    clock_t cpu_time = matmultCPU(a, n, b, n, d, n, n);
    clock_t gpu_time = matmultCUDA(a, n, b, n, c, n, n);
    compare_mat(c, n, d, n, n);
    /* BUG FIX: the original printed raw clock_t with %d — UB on LP64
     * and the value is ticks, not milliseconds.  Convert through
     * CLOCKS_PER_SEC and print as long. */
    printf("CPU time used: %ldms \n", (long)(cpu_time * 1000 / CLOCKS_PER_SEC));
    printf("GPU time used: %ldms \n", (long)(gpu_time * 1000 / CLOCKS_PER_SEC));

    free(a);
    free(b);
    free(c);
    free(d);
    system("pause");  /* Windows-only: keeps the console window open */
    return 0;
}

参考:《深入浅出谈CUDA》

标签:int,float,矩阵,CUDA,pitch,sizeof,SIZE,BLOCK,乘法

来源: https://www.cnblogs.com/ww1x/p/10856452.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值