CUDA Programming: Parallel Matrix Multiplication in About 80 Lines of Code

Overview

Only square-matrix multiplication is implemented here, but the approach is essentially the same for the general case.

  • I have tested it on 100×100 square matrices and it produces correct results.

Code

  • Reads the input file data.txt.
  • The file format is a single number N, followed by two consecutive N*N matrices, all separated by whitespace; see the example below.
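A hypothetical data.txt for N = 2 could look like this (first matrix A, then matrix B):

2
1 2
3 4
5 6
7 8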
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <stdio.h>
// Kernel: each thread computes one element c[tx][ty] of C = A * B
__global__ void MatrixMultiply(int *a, int * b, int *c, int N) {
	int tx = threadIdx.x + blockIdx.x * blockDim.x;
	int ty = threadIdx.y + blockIdx.y * blockDim.y;
	if (tx < N && ty < N) {
		int sum = 0;
		for (int k = 0; k < N; ++k) {
			int adata = a[tx * N + k];
			int bdata = b[k * N + ty];
			sum += adata * bdata;
		}
		c[tx * N + ty] = sum;
	}
}

cudaError_t matrixMultiplyWithCuda(int *a, int *b, int *c, size_t N);

int main()
{
	std::ifstream in("data.txt");
	int N;
	in >> N;
	if (in.fail()) {
		printf("Failed to read N from data.txt\n");
		return 1;
	}
	printf("Read N = %d\n", N);
	// host allocations
	int *a = new int[N * N];
	int *b = new int[N * N];
	int *c = new int[N * N];

	// read 
	for (int i = 0; i < N; ++i)
		for (int j = 0; j < N; ++j) in >> a[i * N + j];

	for (int i = 0; i < N; ++i)
		for (int j = 0; j < N; ++j) in >> b[i * N + j];

	cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);

	for (int i = 0; i < N; ++i) {
		for (int j = 0; j < N; ++j) std::cout << c[i * N + j]<<" ";
		std::cout << std::endl;
	}
	cudaStatus = cudaDeviceReset();   // cudaThreadExit() is deprecated

	// host free 
	delete[] a;
	delete[] b;
	delete[] c;
	return 0;
}
cudaError_t matrixMultiplyWithCuda(int *a, int *b, int *c, size_t N) {
	int *dev_a = 0;
	int *dev_b = 0;
	int *dev_c = 0;
	cudaError_t cudaStatus;
	// Launch configuration: 32x32 threads per block, enough blocks to cover N x N.
	// Declared before the first goto so no initialization is jumped over.
	dim3 threadPerBlock(32, 32);
	dim3 numBlocks((N + threadPerBlock.x - 1) / threadPerBlock.x,
	               (N + threadPerBlock.y - 1) / threadPerBlock.y);

	// device allocations and host-to-device copies
	cudaStatus = cudaMalloc((void**)&dev_a, N * N * sizeof(int));
	cudaStatus = cudaMalloc((void**)&dev_b, N * N * sizeof(int));
	cudaStatus = cudaMalloc((void**)&dev_c, N * N * sizeof(int));
	cudaStatus = cudaMemcpy(dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
	cudaStatus = cudaMemcpy(dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		printf("cudaMalloc/cudaMemcpy failed\n");
		goto Error;
	}
	// kernel invocation
	MatrixMultiply<<<numBlocks, threadPerBlock>>>(dev_a, dev_b, dev_c, N);
	cudaStatus = cudaGetLastError();   // catch launch errors before copying back
	if (cudaStatus != cudaSuccess) {
		printf("Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
		goto Error;
	}
	cudaStatus = cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost);
Error:
	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);
	return cudaStatus;
}
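As a sanity check (not part of the original listing), the GPU result can be compared against a plain CPU triple loop. A minimal sketch; the helper name verifyOnCPU is my own:

// Naive CPU reference: returns true if every element of the GPU result matches.
bool verifyOnCPU(const int *a, const int *b, const int *c_gpu, int N) {
	for (int i = 0; i < N; ++i)
		for (int j = 0; j < N; ++j) {
			int sum = 0;
			for (int k = 0; k < N; ++k)
				sum += a[i * N + k] * b[k * N + j];
			if (sum != c_gpu[i * N + j]) return false;
		}
	return true;
}

Calling verifyOnCPU(a, b, c, N) in main right after matrixMultiplyWithCuda returns compares the kernel output with the host result.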

Version that writes the result to a file

(also switched to floating-point arithmetic)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <stdio.h>
// Kernel: each thread computes one element c[tx][ty] of C = A * B
__global__ void MatrixMultiply(float *a, float * b, float *c, int N) {
	int tx = threadIdx.x + blockIdx.x * blockDim.x;
	int ty = threadIdx.y + blockIdx.y * blockDim.y;
	if (tx < N && ty < N) {
		float sum = 0;
		for (int k = 0; k < N; ++k) {
			float adata = a[tx * N + k];
			float bdata = b[k * N + ty];
			sum += adata * bdata;
		}
		c[tx * N + ty] = sum;
	}
}

cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t N);

int main()
{
	std::ifstream in("data.txt");
	int N;
	in >> N;
	if (in.fail()) {
		printf("Failed to read N from data.txt\n");
		return 1;
	}
	printf("Read N = %d\n", N);
	// host allocations
	float *a = new float[N * N];
	float *b = new float[N * N];
	float *c = new float[N * N];

	// read 
	for (int i = 0; i < N; ++i)
		for (int j = 0; j < N; ++j) in >> a[i * N + j];

	for (int i = 0; i < N; ++i)
		for (int j = 0; j < N; ++j) in >> b[i * N + j];

	cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);

	std::ofstream out("output.txt");
	for (int i = 0; i < N; ++i) {
		for (int j = 0; j < N; ++j) out << c[i * N + j]<<" ";
		out << std::endl;
	}
	cudaStatus = cudaDeviceReset();   // cudaThreadExit() is deprecated

	// host free 
	delete[] a;
	delete[] b;
	delete[] c;
	return 0;
}
cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t N) {
	float *dev_a = 0;
	float *dev_b = 0;
	float *dev_c = 0;
	cudaError_t cudaStatus;
	// Launch configuration: 32x32 threads per block, enough blocks to cover N x N.
	// Declared before the first goto so no initialization is jumped over.
	dim3 threadPerBlock(32, 32);
	dim3 numBlocks((N + threadPerBlock.x - 1) / threadPerBlock.x,
	               (N + threadPerBlock.y - 1) / threadPerBlock.y);

	// device allocations and host-to-device copies (note sizeof(float), not sizeof(int))
	cudaStatus = cudaMalloc((void**)&dev_a, N * N * sizeof(float));
	cudaStatus = cudaMalloc((void**)&dev_b, N * N * sizeof(float));
	cudaStatus = cudaMalloc((void**)&dev_c, N * N * sizeof(float));
	cudaStatus = cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
	cudaStatus = cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		printf("cudaMalloc/cudaMemcpy failed\n");
		goto Error;
	}
	// kernel invocation
	MatrixMultiply<<<numBlocks, threadPerBlock>>>(dev_a, dev_b, dev_c, N);
	cudaStatus = cudaGetLastError();   // catch launch errors before copying back
	if (cudaStatus != cudaSuccess) {
		printf("Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
		goto Error;
	}
	cudaStatus = cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
Error:
	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);
	return cudaStatus;
}
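If you also want to time the kernel, CUDA events are the usual tool. A rough sketch, assuming dev_a, dev_b, dev_c, numBlocks and threadPerBlock are already set up as inside matrixMultiplyWithCuda above:

// Time the kernel launch with CUDA events (result in milliseconds).
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
MatrixMultiply<<<numBlocks, threadPerBlock>>>(dev_a, dev_b, dev_c, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("Kernel time: %.3f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);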