// Original (baseline) version
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
using namespace std;
// Computes one element of the matrix product c = a * b per thread.
//
// Launch layout: 2D launch; the global x index selects the row of c and the
// global y index selects the column. Threads that fall outside the result
// matrix return without touching memory, so oversized launches are safe.
//
// Parameters:
//   a, b - device pointers to the input matrices (rows of 2 ints).
//   c    - device pointer to the output matrix (rows of 2 ints).
//   n    - matrix dimension; clamped to 2, the row width fixed by the
//          parameter types, so the kernel never indexes past a row.
__global__ void jia(int a[2][2], int b[2][2], int c[2][2], int n) {
    const int dim = n < 2 ? n : 2;
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int idy = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard: the launch configuration may cover more threads than elements.
    if (idx >= dim || idy >= dim) {
        return;
    }

    int sum = 0;
    for (int i = 0; i < dim; i++) {
        sum += a[idx][i] * b[i][idy];
    }
    c[idx][idy] = sum;
}
// Host driver: fills two 2x2 matrices (all 2s and all 5s), multiplies them on
// the GPU with the jia kernel and prints the 2x2 result, one row per line.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main() {
    const size_t bytes = sizeof(int) * 4;
    int ha[2][2], hb[2][2], hc[2][2];
    // Null-initialised so the unconditional cudaFree calls below are safe even
    // when an allocation failed.
    int (*da)[2] = nullptr, (*db)[2] = nullptr, (*dc)[2] = nullptr;

    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            ha[i][j] = 2;
            hb[i][j] = 5;
        }
    }

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            cerr << what << " failed: " << cudaGetErrorString(status) << endl;
            return false;
        }
        return true;
    };

    // Short-circuit chain: the first failure stops all later steps.
    bool good = ok(cudaMalloc((void**)&da, bytes), "cudaMalloc(da)")
             && ok(cudaMalloc((void**)&db, bytes), "cudaMalloc(db)")
             && ok(cudaMalloc((void**)&dc, bytes), "cudaMalloc(dc)")
             && ok(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
             && ok(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(db)");

    if (good) {
        dim3 Block(2, 2);
        jia<<<1, Block>>>(da, db, dc, 2);
        // cudaGetLastError catches a bad launch configuration; the blocking
        // copy that follows surfaces errors raised while the kernel ran.
        good = ok(cudaGetLastError(), "jia launch")
            && ok(cudaMemcpy(hc, dc, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hc)");
    }

    if (good) {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                cout << hc[i][j] << " ";
            }
            cout << endl;
        }
    }

    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return good ? 0 : 1;
}
// Optimization 1: shared-memory copies of the input matrices
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
using namespace std;
// Computes one element of the matrix product c = a * b per thread, reading
// the operands from shared-memory copies of a and b.
//
// Launch layout: 2D launch; the global x index selects the row of c and the
// global y index selects the column. Every block loads the complete 2x2
// inputs into its own shared memory, so any block shape is valid.
// Shared memory: two static 2x2 int tiles (32 bytes per block).
//
// Parameters:
//   a, b - device pointers to the input matrices (rows of 2 ints).
//   c    - device pointer to the output matrix (rows of 2 ints).
//   n    - matrix dimension; clamped to 2, the size of the shared tiles.
__global__ void jia(int a[2][2], int b[2][2], int c[2][2], int n) {
    __shared__ int da[2][2];
    __shared__ int db[2][2];

    // Cooperative load: the block's threads split the 4 elements between
    // them instead of every thread copying the whole matrix.
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
    const int nthreads = blockDim.x * blockDim.y;
    for (int k = tid; k < 2 * 2; k += nthreads) {
        da[k / 2][k % 2] = a[k / 2][k % 2];
        db[k / 2][k % 2] = b[k / 2][k % 2];
    }
    // All threads reach the barrier (no early return above) so the tiles are
    // complete before anyone reads them.
    __syncthreads();

    const int dim = n < 2 ? n : 2;
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int idy = threadIdx.y + blockIdx.y * blockDim.y;
    if (idx >= dim || idy >= dim) {
        return;
    }

    // Read the operands from shared memory rather than global memory.
    int sum = 0;
    for (int i = 0; i < dim; i++) {
        sum += da[idx][i] * db[i][idy];
    }
    c[idx][idy] = sum;
}
// Host driver: fills two 2x2 matrices (all 2s and all 5s), multiplies them on
// the GPU with the jia kernel and prints the 2x2 result, one row per line.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main() {
    const size_t bytes = sizeof(int) * 4;
    int ha[2][2], hb[2][2], hc[2][2];
    // Null-initialised so the unconditional cudaFree calls below are safe even
    // when an allocation failed.
    int (*da)[2] = nullptr, (*db)[2] = nullptr, (*dc)[2] = nullptr;

    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            ha[i][j] = 2;
            hb[i][j] = 5;
        }
    }

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            cerr << what << " failed: " << cudaGetErrorString(status) << endl;
            return false;
        }
        return true;
    };

    // Short-circuit chain: the first failure stops all later steps.
    bool good = ok(cudaMalloc((void**)&da, bytes), "cudaMalloc(da)")
             && ok(cudaMalloc((void**)&db, bytes), "cudaMalloc(db)")
             && ok(cudaMalloc((void**)&dc, bytes), "cudaMalloc(dc)")
             && ok(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
             && ok(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(db)");

    if (good) {
        dim3 Block(2, 2);
        jia<<<1, Block>>>(da, db, dc, 2);
        // cudaGetLastError catches a bad launch configuration; the blocking
        // copy that follows surfaces errors raised while the kernel ran.
        good = ok(cudaGetLastError(), "jia launch")
            && ok(cudaMemcpy(hc, dc, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hc)");
    }

    if (good) {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                cout << hc[i][j] << " ";
            }
            cout << endl;
        }
    }

    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return good ? 0 : 1;
}
// Optimization 2: 16x16 matrices, partial products staged in shared memory
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
using namespace std;
// Computes one element of the matrix product c = a * b per thread. Each thread
// first stages its partial products a[idx][i] * b[i][idy] in a private row of
// shared memory and then sums that row into c[idx][idy].
//
// Launch layout: 2D launch; the global x index selects the row of c and the
// global y index selects the column. Threads outside the result do no work.
// Shared memory: one static 256 x 16 int array (16 KiB per block).
//
// Parameters:
//   a, b - device pointers to the input matrices (rows of 16 ints).
//   c    - device pointer to the output matrix (rows of 16 ints); every
//          in-range element is overwritten, so it need not be initialised.
//   n    - matrix dimension; clamped to 16, the row width fixed by the
//          parameter types and by the shared staging array.
__global__ void jia(int a[16][16], int b[16][16], int c[16][16], int n) {
    __shared__ int da[16 * 16][16];

    const int dim = n < 16 ? n : 16;
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int idy = threadIdx.y + blockIdx.y * blockDim.y;
    const bool active = idx < dim && idy < dim;
    // Staging row owned by this thread; only used when the thread is active.
    const int slot = idy * dim + idx;

    if (active) {
        for (int i = 0; i < dim; i++) {
            da[slot][i] = a[idx][i] * b[i][idy];
        }
    }
    // Kept outside the branch so every thread in the block reaches the barrier.
    __syncthreads();

    if (active) {
        // Accumulate in a register and store once: c comes from cudaMalloc and
        // is not guaranteed to start at zero, so "+=" on it would be undefined.
        int sum = 0;
        for (int i = 0; i < dim; i++) {
            sum += da[slot][i];
        }
        c[idx][idy] = sum;
    }
}
// Host driver: fills two 16x16 matrices (all 2s and all 5s), multiplies them
// on the GPU with the jia kernel and prints the 16x16 result, one row per line.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main() {
    const size_t bytes = sizeof(int) * 16 * 16;
    int ha[16][16], hb[16][16], hc[16][16];
    // Null-initialised so the unconditional cudaFree calls below are safe even
    // when an allocation failed.
    int (*da)[16] = nullptr, (*db)[16] = nullptr, (*dc)[16] = nullptr;

    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++) {
            ha[i][j] = 2;
            hb[i][j] = 5;
        }
    }

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            cerr << what << " failed: " << cudaGetErrorString(status) << endl;
            return false;
        }
        return true;
    };

    // Short-circuit chain: the first failure stops all later steps.
    // The output buffer is zeroed because cudaMalloc returns uninitialised
    // memory; this keeps the result well defined if the kernel accumulates
    // into c instead of overwriting it.
    bool good = ok(cudaMalloc((void**)&da, bytes), "cudaMalloc(da)")
             && ok(cudaMalloc((void**)&db, bytes), "cudaMalloc(db)")
             && ok(cudaMalloc((void**)&dc, bytes), "cudaMalloc(dc)")
             && ok(cudaMemset(dc, 0, bytes), "cudaMemset(dc)")
             && ok(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
             && ok(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(db)");

    if (good) {
        dim3 Block(16, 16);
        jia<<<1, Block>>>(da, db, dc, 16);
        // cudaGetLastError catches a bad launch configuration; the blocking
        // copy that follows surfaces errors raised while the kernel ran.
        good = ok(cudaGetLastError(), "jia launch")
            && ok(cudaMemcpy(hc, dc, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hc)");
    }

    if (good) {
        for (int i = 0; i < 16; i++) {
            for (int j = 0; j < 16; j++) {
                cout << hc[i][j] << " ";
            }
            cout << endl;
        }
    }

    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return good ? 0 : 1;
}
// Optimization 3: partial products written out, then summed by a separate reduction kernel
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
using namespace std;
// Writes the partial products of the matrix product a * b without summing them.
// For result element (idx, idy), output row idx * n + idy receives the values
// a[idx][i] * b[i][idy] for i = 0..n-1; summing a row yields that element.
//
// Launch layout: 2D launch; the global x index is the result row idx and the
// global y index is the result column idy. Threads outside the result do no work.
// Shared memory: one static 256 x 16 int array (16 KiB per block).
//
// Parameters:
//   a, b - device pointers to the input matrices (rows of 16 ints).
//   c    - device pointer to the partial-product buffer. Despite the declared
//          type it is indexed as n * n rows of 16 ints, so the caller must
//          allocate at least n * n * 16 ints.
//   n    - matrix dimension; clamped to 16, the row width fixed by the
//          parameter types and by the shared staging array.
__global__ void jia(int a[16][16], int b[16][16], int c[16][16], int n) {
    __shared__ int da[16 * 16][16];

    const int dim = n < 16 ? n : 16;
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int idy = threadIdx.y + blockIdx.y * blockDim.y;
    const bool active = idx < dim && idy < dim;
    // Row-major slot (row idx, column idy). The previous idy * n + idx ordering
    // stored element (idx, idy) where a reader indexing rows as row * n + col
    // looks for (idy, idx), i.e. it produced the transposed product.
    const int slot = idx * dim + idy;

    if (active) {
        for (int i = 0; i < dim; i++) {
            da[slot][i] = a[idx][i] * b[i][idy];
        }
    }
    // Kept outside the branch so every thread in the block reaches the barrier.
    __syncthreads();

    if (active) {
        for (int i = 0; i < dim; i++) {
            c[slot][i] = da[slot][i];
        }
    }
}
// Sums the 16 ints in a and stores the total in b[0].
//
// Launch layout: one 1D block (threads are distinguished by threadIdx.x only);
// any block width works because loads and reduction steps are strided by
// blockDim.x. Shared memory: 16 static ints per block.
//
// Parameters:
//   a - device pointer to the 16 input values; no element past a[15] is read.
//   b - device pointer to the output; only b[0] is written.
__global__ void gui(int a[16], int b[16]) {
    __shared__ int sdata[16];
    const int tid = threadIdx.x;

    // Load exactly the 16 inputs, so every slot read below is initialised and
    // nothing outside the input array is touched.
    for (int k = tid; k < 16; k += blockDim.x) {
        sdata[k] = a[k];
    }
    __syncthreads();

    // Tree reduction: each pass folds the upper half onto the lower half.
    // The barrier after every pass makes the writes visible to the threads of
    // the next pass; the loop bounds are uniform, so all threads reach it.
    for (int half = 8; half > 0; half >>= 1) {
        for (int k = tid; k < half; k += blockDim.x) {
            sdata[k] += sdata[k + half];
        }
        __syncthreads();
    }

    if (tid == 0) {
        b[0] = sdata[0];
    }
}
// Host driver: fills two 16x16 matrices (all 2s and all 5s), runs the jia
// kernel to produce 256 rows of 16 partial products, sums each row with the
// gui kernel and prints the resulting 16x16 matrix, one row per line.
// jieguo[i][j] is the sum of partial-product row i * 16 + j.
// Returns 0 on success and 1 if any CUDA call or kernel launch fails.
int main() {
    const size_t matBytes = sizeof(int) * 16 * 16;
    const size_t partBytes = sizeof(int) * 256 * 16;
    const size_t rowBytes = sizeof(int) * 16;

    int ha[16][16], hb[16][16], hc[256][16], jieguo[16][16];
    // Null-initialised so the unconditional cudaFree calls below are safe even
    // when an allocation failed.
    int (*da)[16] = nullptr, (*db)[16] = nullptr, (*dc)[16] = nullptr;
    int* dd = nullptr;
    int* de = nullptr;

    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++) {
            ha[i][j] = 2;
            hb[i][j] = 5;
        }
    }

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            cerr << what << " failed: " << cudaGetErrorString(status) << endl;
            return false;
        }
        return true;
    };

    // Short-circuit chain: the first failure stops all later steps.
    bool good = ok(cudaMalloc((void**)&da, matBytes), "cudaMalloc(da)")
             && ok(cudaMalloc((void**)&db, matBytes), "cudaMalloc(db)")
             && ok(cudaMalloc((void**)&dc, partBytes), "cudaMalloc(dc)")
             && ok(cudaMalloc((void**)&dd, rowBytes), "cudaMalloc(dd)")
             && ok(cudaMalloc((void**)&de, rowBytes), "cudaMalloc(de)")
             && ok(cudaMemcpy(da, ha, matBytes, cudaMemcpyHostToDevice), "cudaMemcpy(da)")
             && ok(cudaMemcpy(db, hb, matBytes, cudaMemcpyHostToDevice), "cudaMemcpy(db)");

    if (good) {
        dim3 Block(16, 16);
        jia<<<1, Block>>>(da, db, dc, 16);
        // cudaGetLastError catches a bad launch configuration; the blocking
        // copy that follows surfaces errors raised while the kernel ran.
        good = ok(cudaGetLastError(), "jia launch")
            && ok(cudaMemcpy(hc, dc, partBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hc)");
    }

    // Reduce each partial-product row on the device, one launch per element.
    for (int i = 0; good && i < 16; i++) {
        for (int j = 0; good && j < 16; j++) {
            // A row of hc is already contiguous, so it is uploaded directly
            // instead of being copied into a temporary host array first.
            good = ok(cudaMemcpy(dd, hc[i * 16 + j], rowBytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(dd)");
            if (good) {
                gui<<<1, 16>>>(dd, de);
                // Only de[0] carries the result, so a single int is read back.
                good = ok(cudaGetLastError(), "gui launch")
                    && ok(cudaMemcpy(&jieguo[i][j], de, sizeof(int), cudaMemcpyDeviceToHost),
                          "cudaMemcpy(jieguo)");
            }
        }
    }

    if (good) {
        for (int i = 0; i < 16; i++) {
            for (int j = 0; j < 16; j++) {
                cout << jieguo[i][j] << " ";
            }
            cout << endl;
        }
    }

    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    // The reduction scratch buffers were previously leaked.
    cudaFree(dd);
    cudaFree(de);
    return good ? 0 : 1;
}