CUDA 矩阵相乘(逐步优化):下面依次给出原始实现与三个优化版本的完整代码。

原始版本

#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
using namespace std;
// Computes C = A * B for an n x n matrix, one thread per output element.
// Expected launch: a 2-D configuration covering n x n threads,
// e.g. <<<1, dim3(n, n)>>>.
// Fix vs. original: threads outside the n x n range (when the launch shape
// is larger than the matrix) no longer read/write out of bounds.
__global__ void jia(int a[2][2], int b[2][2], int c[2][2], int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;  // row of C
    int idy = threadIdx.y + blockIdx.y * blockDim.y;  // column of C
    if (idx >= n || idy >= n) return;  // bounds guard for oversized launches
    int sum = 0;
    for (int i = 0; i < n; i++) {
        sum += a[idx][i] * b[i][idy];  // dot product of row idx and column idy
    }
    c[idx][idy] = sum;
}
// Host driver: builds 2x2 matrices A (all 2s) and B (all 5s), multiplies them
// on the GPU, and prints C.  Every element of C should be 2*5*2 = 20.
int main() {
    int ha[2][2], hb[2][2], hc[2][2];
    int (*da)[2], (*db)[2], (*dc)[2];
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
            ha[i][j] = 2;
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
            hb[i][j] = 5;
    cudaMalloc((void**)&da, sizeof(int) * 4);
    cudaMalloc((void**)&db, sizeof(int) * 4);
    cudaMalloc((void**)&dc, sizeof(int) * 4);
    cudaMemcpy(da, ha, sizeof(int) * 4, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, sizeof(int) * 4, cudaMemcpyHostToDevice);
    dim3 Block(2, 2);
    // Fix vs. original: "jia << <1, Block >> >" is not valid launch syntax
    // (an artifact of HTML escaping); the execution configuration must be
    // the single token sequence <<<grid, block>>>.
    jia<<<1, Block>>>(da, db, dc, 2);
    // Kernel launches return no status directly; check for launch errors.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        cout << "kernel launch failed: " << cudaGetErrorString(err) << endl;
    // Blocking copy: also synchronizes with the kernel before reading results.
    cudaMemcpy(hc, dc, sizeof(int) * 4, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            cout << hc[i][j] << "  ";
        }
        cout << endl;
    }
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return 0;
}

优化一

#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
using namespace std;

// Computes C = A * B (n x n, n <= 2) using shared-memory tiles.
// Expected launch: <<<1, dim3(n, n)>>>.
// Fixes vs. original: (1) the compute loop read the GLOBAL arrays a/b, so
// the shared tiles were filled but never used; (2) every thread copied the
// ENTIRE matrix into shared memory — now each thread stages exactly its own
// element, and the barrier is kept outside divergent control flow.
__global__ void jia(int a[2][2], int b[2][2], int c[2][2], int n) {
	__shared__ int da[2][2];
	__shared__ int db[2][2];
	int idx = threadIdx.x + blockIdx.x * blockDim.x;  // row
	int idy = threadIdx.y + blockIdx.y * blockDim.y;  // column
	// Stage this thread's element of each input into shared memory.
	if (idx < n && idy < n) {
		da[idx][idy] = a[idx][idy];
		db[idx][idy] = b[idx][idy];
	}
	__syncthreads();  // every thread reaches this barrier (no early return above)
	if (idx < n && idy < n) {
		int sum = 0;
		for (int i = 0; i < n; i++) {
			sum += da[idx][i] * db[i][idy];  // read the shared tiles, not global memory
		}
		c[idx][idy] = sum;
	}
}

// Host driver for the shared-memory version: A is all 2s, B is all 5s,
// so every element of the 2x2 product is 2*5*2 = 20.
int main() {
	int ha[2][2], hb[2][2], hc[2][2];
	int(*da)[2], (*db)[2], (*dc)[2];
	for (int i = 0; i < 2; i++)
		for (int j = 0; j < 2; j++)
			ha[i][j] = 2;
	for (int i = 0; i < 2; i++)
		for (int j = 0; j < 2; j++)
			hb[i][j] = 5;
	cudaMalloc((void**)&da, sizeof(int) * 4);
	cudaMalloc((void**)&db, sizeof(int) * 4);
	cudaMalloc((void**)&dc, sizeof(int) * 4);
	cudaMemcpy(da, ha, sizeof(int) * 4, cudaMemcpyHostToDevice);
	cudaMemcpy(db, hb, sizeof(int) * 4, cudaMemcpyHostToDevice);
	dim3 Block(2, 2);
	// Fix vs. original: the launch must use the single token sequence
	// <<<grid, block>>>; "<< < ... >> >" does not parse.
	jia<<<1, Block>>>(da, db, dc, 2);
	cudaError_t err = cudaGetLastError();  // launches report errors only this way
	if (err != cudaSuccess)
		cout << "kernel launch failed: " << cudaGetErrorString(err) << endl;
	// Blocking copy synchronizes with the kernel before reading results.
	cudaMemcpy(hc, dc, sizeof(int) * 4, cudaMemcpyDeviceToHost);
	for (int i = 0; i < 2; i++) {
		for (int j = 0; j < 2; j++) {
			cout << hc[i][j] << "  ";
		}
		cout << endl;
	}
	cudaFree(da);
	cudaFree(db);
	cudaFree(dc);
	return 0;
}

优化二

#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
using namespace std;

// Product-then-reduce matrix multiply: each thread writes the n partial
// products for its output element (idx, idy) into one row of a shared
// scratch array, then sums that row into C.
// Expected launch: <<<1, dim3(16, 16)>>> with n == 16.
// Fix vs. original: C was accumulated with "+=" into device memory the host
// never zeroed, so results included uninitialized garbage.  The sum is now
// accumulated in a register and stored with "=".
__global__ void jia(int a[16][16], int b[16][16], int c[16][16], int n) {
	__shared__ int da[16 * 16][16];  // one row of partial products per output element
	int idx = threadIdx.x + blockIdx.x * blockDim.x;  // row of C
	int idy = threadIdx.y + blockIdx.y * blockDim.y;  // column of C
	for (int i = 0; i < n; i++) {
		da[idy * n + idx][i] = a[idx][i] * b[i][idy];
	}
	__syncthreads();
	int sum = 0;  // register accumulator (replaces "+=" into uninitialized c)
	for (int i = 0; i < n; i++) {
		sum += da[idy * n + idx][i];
	}
	c[idx][idy] = sum;
}


// Host driver for the 16x16 version: A is all 2s, B is all 5s, so every
// element of the product is 2*5*16 = 160.
int main() {
	int ha[16][16], hb[16][16], hc[16][16];
	int(*da)[16], (*db)[16], (*dc)[16];
	for (int i = 0; i < 16; i++)
		for (int j = 0; j < 16; j++)
			ha[i][j] = 2;
	for (int i = 0; i < 16; i++)
		for (int j = 0; j < 16; j++)
			hb[i][j] = 5;
	cudaMalloc((void**)&da, sizeof(int) * 16 * 16);
	cudaMalloc((void**)&db, sizeof(int) * 16 * 16);
	cudaMalloc((void**)&dc, sizeof(int) * 16 * 16);
	// Fix vs. original: the kernel accumulates into c with "+=", so dc must
	// be zeroed first — cudaMalloc does not initialize memory.
	cudaMemset(dc, 0, sizeof(int) * 16 * 16);
	cudaMemcpy(da, ha, sizeof(int) * 16 * 16, cudaMemcpyHostToDevice);
	cudaMemcpy(db, hb, sizeof(int) * 16 * 16, cudaMemcpyHostToDevice);
	dim3 Block(16, 16);
	// Fix vs. original: launch syntax must be the token sequence <<<...>>>.
	jia<<<1, Block>>>(da, db, dc, 16);
	cudaError_t err = cudaGetLastError();  // launches report errors only this way
	if (err != cudaSuccess)
		cout << "kernel launch failed: " << cudaGetErrorString(err) << endl;
	// Blocking copy synchronizes with the kernel before reading results.
	cudaMemcpy(hc, dc, sizeof(int) * 16 * 16, cudaMemcpyDeviceToHost);
	for (int i = 0; i < 16; i++) {
		for (int j = 0; j < 16; j++) {
			cout << hc[i][j] << "  ";
		}
		cout << endl;
	}
	cudaFree(da);
	cudaFree(db);
	cudaFree(dc);
	return 0;
}

优化三

#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
using namespace std;

// Stage 1 of a two-kernel multiply: writes the n partial products of every
// output element to global memory.  NOTE(review): despite the [16][16]
// parameter type, c actually points to a 256x16 buffer (the host allocates
// 256*16 ints); row idy*n+idx holds the n products for output element
// (idx, idy), which gui() later sums on the host's behalf.
// Expected launch: <<<1, dim3(16, 16)>>> with n == 16.
// Fix vs. original: the products were written to a shared-memory scratch
// array, synchronized, and then copied verbatim to global memory with no
// reuse in between — a pure round trip.  They are now stored directly.
__global__ void jia(int a[16][16], int b[16][16], int c[16][16], int n) {
	int idx = threadIdx.x + blockIdx.x * blockDim.x;  // row of the output
	int idy = threadIdx.y + blockIdx.y * blockDim.y;  // column of the output
	for (int i = 0; i < n; i++) {
		c[idy * n + idx][i] = a[idx][i] * b[i][idy];
	}
}

// Sums the 16 ints in a[] and writes the total to b[0].
// Expected launch: <<<1, 16>>> (a single 16-thread block).
// Fixes vs. original: (1) with 16 threads, "x = threadIdx.x * 2" made
// threads 8..15 read a[16..31], past the end of the 16-element input;
// (2) the tree reduction then folded those out-of-bounds values
// (sdata[8..15]) into the result; (3) the reduction steps ran with no
// barrier between them — a data race under independent thread scheduling.
__global__ void gui(int a[16], int b[16]) {
	__shared__ int sdata[16];
	int idx = threadIdx.x;
	// First step: 8 threads each add one adjacent pair (covers a[0..15]);
	// the remaining threads pad with the additive identity.
	if (idx < 8) sdata[idx] = a[2 * idx] + a[2 * idx + 1];
	else if (idx < 16) sdata[idx] = 0;
	__syncthreads();
	// Tree reduction over the 8 pair sums, with a barrier after every step.
	for (int s = 4; s > 0; s >>= 1) {
		if (idx < s) sdata[idx] += sdata[idx + s];
		__syncthreads();
	}
	if (idx == 0) b[0] = sdata[0];
}

// Host driver for the two-kernel version: jia() writes 16 partial products
// per output element into a 256x16 buffer, then gui() sums each row of 16
// on the GPU.  A is all 2s, B is all 5s, so every result is 2*5*16 = 160.
// NOTE(review): the per-element memcpy + launch loop below (256 iterations)
// is extremely slow; a single batched reduction kernel would be far better.
int main() {
	int ha[16][16], hb[16][16], hc[256][16], jieguo[16][16];
	int(*da)[16], (*db)[16], (*dc)[16];
	for (int i = 0; i < 16; i++)
		for (int j = 0; j < 16; j++)
			ha[i][j] = 2;
	for (int i = 0; i < 16; i++)
		for (int j = 0; j < 16; j++)
			hb[i][j] = 5;
	cudaMalloc((void**)&da, sizeof(int) * 16 * 16);
	cudaMalloc((void**)&db, sizeof(int) * 16 * 16);
	cudaMalloc((void**)&dc, sizeof(int) * 256 * 16);  // 256 rows of 16 partial products
	cudaMemcpy(da, ha, sizeof(int) * 16 * 16, cudaMemcpyHostToDevice);
	cudaMemcpy(db, hb, sizeof(int) * 16 * 16, cudaMemcpyHostToDevice);
	dim3 Block(16, 16);
	// Fix vs. original: launch syntax must be the token sequence <<<...>>>.
	jia<<<1, Block>>>(da, db, dc, 16);
	cudaError_t err = cudaGetLastError();  // launches report errors only this way
	if (err != cudaSuccess)
		cout << "jia launch failed: " << cudaGetErrorString(err) << endl;
	cudaMemcpy(hc, dc, sizeof(int) * 256 * 16, cudaMemcpyDeviceToHost);
	int hd[16], he[16];
	int* dd, * de;
	cudaMalloc((void**)&dd, sizeof(int) * 16);
	cudaMalloc((void**)&de, sizeof(int) * 16);
	// Reduce each of the 256 rows of partial products with gui().
	for (int i = 0; i < 16; i++) {
		for (int j = 0; j < 16; j++) {
			for (int k = 0; k < 16; k++) {
				hd[k] = hc[i * 16 + j][k];
			}
			cudaMemcpy(dd, hd, sizeof(int) * 16, cudaMemcpyHostToDevice);
			gui<<<1, 16>>>(dd, de);  // fixed launch syntax
			// Only de[0] holds the sum; copy just that element back.
			cudaMemcpy(he, de, sizeof(int), cudaMemcpyDeviceToHost);
			jieguo[i][j] = he[0];
		}
	}
	for (int i = 0; i < 16; i++) {
		for (int j = 0; j < 16; j++) {
			cout << jieguo[i][j] << "  ";
		}
		cout << endl;
	}
	cudaFree(da);
	cudaFree(db);
	cudaFree(dc);
	cudaFree(dd);  // fix: dd/de were leaked in the original
	cudaFree(de);
	return 0;
}
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值