CUDA Matrix Multiplication

#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"

#include <iostream>
using namespace std;

#define numElements 256
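// All matrices in this example are square: numElements x numElements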

// Timing helper for Windows, based on QueryPerformanceCounter
#include <windows.h>
double get_time() {
	LARGE_INTEGER timer;
	static LARGE_INTEGER fre;
	static int init = 0;
	double t;
	if (init != 1) {
		QueryPerformanceFrequency(&fre);
		init = 1;
	}
	QueryPerformanceCounter(&timer);
	t = timer.QuadPart * 1. / fre.QuadPart;
	return t;
}

void Host_MatrixMultiply(int A[][numElements], int B[][numElements], int C[][numElements]) {
	for (int i = 0; i < numElements; i++) {
		for (int j = 0; j < numElements; j++) {
			for (int k = 0; k < numElements; k++) {
				C[i][j] += A[i][k] * B[k][j];
			}
		}
	}
}

__global__ void MatrixMultiply(int h_a[][numElements], int h_b[][numElements], int h_c[][numElements])
{
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	int j = threadIdx.y + blockIdx.y * blockDim.y;
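	// Each thread computes one element of the product: i selects the row, j the column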

	if (i < numElements && j < numElements) {
		for (int k = 0; k < numElements; k++) {
			h_c[i][j] += h_a[i][k] * h_b[k][j];
		}
	}
}

void textValue(int h_c[][numElements]) {

	cout << "测试程序:测试三组数据" << endl;
	for (int i = 0; i < 5; i++)
	{
		for (int j = 0; j < 5; j++)
			cout << h_c[i][j] << "\t";
		cout << endl;
	}
	cout << endl;
	for (int i = 100; i < 105; i++)
	{
		for (int j = 0; j < 5; j++)
			cout << h_c[i][j] << "\t";
		cout << endl;
	}
	cout << endl;
	for (int i = 250; i < 255; i++)
	{
		for (int j = 0; j < 5; j++)
			cout << h_c[i][j] << "\t";
		cout << endl;
	}
}

int main()
{
	int(*h_a)[numElements] = new int[numElements][numElements];
	int(*h_b)[numElements] = new int[numElements][numElements];
	int(*h_c)[numElements] = new int[numElements][numElements];
	int(*h_d)[numElements] = new int[numElements][numElements];

	for (int i = 0; i < numElements; i++) {
		for (int j = 0; j < numElements; j++) {
			h_a[i][j] = i + 1;
			h_b[i][j] = i + 1;
			h_c[i][j] = 0;
			h_d[i][j] = 0;
		}
	}

	cudaError_t error = cudaSuccess;

	int(*d_a)[numElements], (*d_b)[numElements], (*d_c)[numElements];

	error = cudaMalloc((void**)&d_a, sizeof(int) * numElements * numElements);
	error = cudaMalloc((void**)&d_b, sizeof(int) * numElements * numElements);
	error = cudaMalloc((void**)&d_c, sizeof(int) * numElements * numElements);

	cudaMemcpy(d_a, h_a, sizeof(int) * numElements * numElements, cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, h_b, sizeof(int) * numElements * numElements, cudaMemcpyHostToDevice);
	cudaMemcpy(d_c, h_c, sizeof(int) * numElements * numElements, cudaMemcpyHostToDevice);

	double td = 0;
	double th = 0;
	// GPU execution time
	cudaDeviceSynchronize();	// block until all previously issued GPU work has finished
	td = get_time();

	dim3 threads(32, 32);
	dim3 blocks((numElements + threads.x - 1) / threads.x,
		(numElements + threads.y - 1) / threads.y);	// just enough blocks to cover the whole matrix
	MatrixMultiply<<<blocks, threads>>>(d_a, d_b, d_c);

	cudaDeviceSynchronize();
	td = get_time() - td;
	cout << "GPU执行时间:" << td << endl;//得到GPU执行最终时间

	// copy the result back to host memory
	cudaMemcpy(h_c, d_c, sizeof(int) * numElements * numElements, cudaMemcpyDeviceToHost);

	// CPU execution time
	cudaDeviceSynchronize();	// make sure all GPU work is done before timing the CPU
	th = get_time();
	Host_MatrixMultiply(h_a, h_b, h_d);
	cudaDeviceSynchronize();
	th = get_time() - th;
	cout << "CPU execution time: " << th << endl;	// elapsed CPU time in seconds

	cout << "GPU:" << endl;
	textValue(h_c);
	cout << endl;
	cout << "CPU:" << endl;
	textValue(h_d);
	cout << endl;

	delete[] h_a;	// allocated with new[], so release with delete[], not free()
	delete[] h_b;
	delete[] h_c;
	delete[] h_d;
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
	return 0;
}
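
Note that the listing above stores the return value of each cudaMalloc in error but never inspects it, and the kernel launch itself is never checked either. A minimal sketch of how such checks could look is shown below; the CUDA_CHECK macro name is my own and not part of the original program.

```cpp
#include <cstdio>
#include <cstdlib>
#include "cuda_runtime_api.h"

// Hypothetical helper: abort with a readable message if a CUDA call fails.
#define CUDA_CHECK(call)                                                    \
	do {                                                                    \
		cudaError_t err_ = (call);                                          \
		if (err_ != cudaSuccess) {                                          \
			fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
			        cudaGetErrorString(err_), __FILE__, __LINE__);          \
			exit(EXIT_FAILURE);                                             \
		}                                                                   \
	} while (0)

// Usage sketch:
//   CUDA_CHECK(cudaMalloc((void**)&d_a, sizeof(int) * numElements * numElements));
//   MatrixMultiply<<<blocks, threads>>>(d_a, d_b, d_c);
//   CUDA_CHECK(cudaGetLastError());        // catches launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches errors raised while the kernel runs
```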

Here is a matrix multiplication example written in CUDA:

```c
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 1024

__global__ void matrixMul(float *a, float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < n && j < n) {
        float sum = 0;
        for (int k = 0; k < n; k++) {
            sum += a[i * n + k] * b[k * n + j];
        }
        c[i * n + j] = sum;
    }
}

int main() {
    float *a, *b, *c;
    float *d_a, *d_b, *d_c;
    int size = N * N * sizeof(float);

    // Allocate memory on host
    a = (float*)malloc(size);
    b = (float*)malloc(size);
    c = (float*)malloc(size);

    // Initialize matrices
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i * N + j] = i + j;
            b[i * N + j] = i - j;
        }
    }

    // Allocate memory on device
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_c, size);

    // Copy matrices from host to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Launch kernel
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
    matrixMul<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c, N);

    // Copy result from device to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    // Verify result
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0;
            for (int k = 0; k < N; k++) {
                sum += a[i * N + k] * b[k * N + j];
            }
            if (c[i * N + j] != sum) {
                printf("Error: c[%d][%d] = %f, expected %f\n", i, j, c[i * N + j], sum);
                return 1;
            }
        }
    }
    printf("Success!\n");

    // Free memory
    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
```

This program uses CUDA to compute the product of two matrices on the GPU. It first allocates memory on the host and initializes the matrices. It then allocates memory on the device and copies the matrices from the host to the device. Next, it launches a CUDA kernel that computes the matrix product on the GPU. Finally, it copies the result back from the device to the host and verifies that it is correct.
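One caveat about the verification step in that example: it compares floating-point results with !=. The GPU compiler may contract multiply-add sequences into FMA instructions, so its rounding can differ slightly from the CPU reference, and an exact comparison can report spurious mismatches for large matrices. Below is a small illustrative helper (the names are mine, not from the example) that compares with a relative tolerance instead.

```cpp
#include <cmath>

// Illustrative sketch: treat two floats as equal if they agree within a relative
// tolerance, with a small absolute floor so values near zero do not spuriously fail.
bool nearlyEqual(float gpuVal, float cpuVal, float relTol = 1e-4f, float absTol = 1e-6f) {
    float diff  = std::fabs(gpuVal - cpuVal);
    float scale = std::fmax(std::fabs(gpuVal), std::fabs(cpuVal));
    return diff <= relTol * scale + absTol;
}
```

The verification loop could then test !nearlyEqual(c[i * N + j], sum) instead of c[i * N + j] != sum.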