Matrix Multiplication: CUDA CPU & GPU Speedup
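
This post times the same dense matrix multiplication, C = A × B with A of size M×K and B of size K×N, once on the CPU and once on the GPU with CUDA, then prints the measured speedup. The full listing follows.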

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define M 8
#define K 8
#define N 8
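
/* A is M x K, B is K x N, and the product C = A * B is M x N.
   All matrices are stored contiguously in row-major order. */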


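// Fill an array of 'size' doubles with random integers in [1, 10].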
void initial(double *array, int size)
{
	for (int i = 0; i < size; i++)
	{
		array[i] = (double)(rand() % 10 + 1);
	}
}

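// Print a row x col matrix stored in row-major order, one row per line.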
void printMatrix(double *array, int row, int col)
{
	double *p = array;
	for (int y = 0; y < row; y++)
	{
		for (int x = 0; x < col; x++)
		{
			printf("%10lf", p[x]);
		}
		p = p + col;
		printf("\n");
	}
	return;
}


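// CPU reference: naive triple loop computing C = A * B.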
void multiplicateMatrixOnHost(double *array_A, double *array_B, double *array_C, int M_p, int K_p, int N_p)
{
	for (int i = 0; i < M_p; i++)
	{
		for (int j = 0; j < N_p; j++)
		{
			double sum = 0;
			for (int k = 0; k < K_p; k++)
			{
				sum += array_A[i*K_p + k] * array_B[k*N_p + j];
			}
			array_C[i*N_p + j] = sum;
		}
	}

}

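/*
 * GPU kernel: each thread computes one element of C. ix selects the column
 * (0..N_p-1) and iy the row (0..M_p-1); threads that land outside the matrix
 * bounds do nothing.
 */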
__global__ void multiplicateMatrixOnDevice(double *array_A, double *array_B, double *array_C, int M_p, int K_p, int N_p)
{
	int ix = threadIdx.x + blockDim.x*blockIdx.x;	// column index of the C element
	int iy = threadIdx.y + blockDim.y*blockIdx.y;	// row index of the C element

	if (ix < N_p && iy < M_p)
	{
		double sum = 0;
		for (int k = 0; k < K_p; k++)
		{
			sum += array_A[iy*K_p + k] * array_B[k*N_p + ix];
		}
		array_C[iy*N_p + ix] = sum;
	}
}


int main(int argc, char **argv)
{
	clock_t start = 0, finish = 0;
	double cpuTime;	// CPU wall time of the host multiply, in seconds

	int Axy = M * K;
	int ABytes = Axy * sizeof(double);

	
	int Bxy = K * N;
	int BBytes = Bxy * sizeof(double);
	double *h_A, *h_B, *hostRef, *deviceRef;
	h_A = (double*)malloc(ABytes);
	h_B = (double*)malloc(BBytes);

	int nBytes = M * N * sizeof(double);
	hostRef = (double*)malloc(nBytes);
	deviceRef = (double*)malloc(nBytes);

	srand((unsigned)time(NULL));	// seed the RNG so the matrices differ between runs
	initial(h_A, Axy);
	printf("\n");
	printf("Matrix_A: (%d×%d)\n", M, K);
	printMatrix(h_A, M,K);
	initial(h_B, Bxy);
	printf("Matrix_B: (%d×%d)\n", K, N);
	printMatrix(h_B, K,N);

	start = clock();
	multiplicateMatrixOnHost(h_A, h_B, hostRef, M, K, N);
	finish = clock();
	cpuTime = (double)(finish - start) / CLOCKS_PER_SEC;

	printf("\n");
	printf("------------------------------------------------------------------------------------\n");
	printf("Computing matrix product using multiplicateMatrixOnHost \n");
	printf("------------------------------------------------------------------------------------\n");

	printf("Matrix_hostRef: (%d×%d)  CPU time: %lfs\n", M, N, cpuTime);
	printMatrix(hostRef, M,N);

	double *d_A, *d_B, *d_C;
	cudaMalloc((void**)&d_A, ABytes);
	cudaMalloc((void**)&d_B, BBytes);
	cudaMalloc((void**)&d_C, nBytes);

	cudaMemcpy(d_A, h_A, ABytes, cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, h_B, BBytes, cudaMemcpyHostToDevice);


	printf("\n\n");
	printf("------------------------------------------------------------------------------------\n");
	printf("Computing matrix product using multiplicateMatrixOnDevice \n");
	printf("------------------------------------------------------------------------------------\n");




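	// Launch one thread per element of C: grid.x must cover the N columns
	// (indexed by ix) and grid.y the M rows (indexed by iy), rounding up so
	// a partial block is still launched when the sizes do not divide evenly.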
	int dimx = 16;
	int dimy = 16;
	dim3 block(dimx, dimy);
	dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);
//	dim3 grid(1, 1);

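	// Time the kernel with CUDA events; cudaEventElapsedTime reports
	// milliseconds, hence the division by 1000 when printing seconds below.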
	cudaEvent_t gpustart, gpustop;
	float elapsedTime = 0.0;
	cudaEventCreate(&gpustart);
	cudaEventCreate(&gpustop);
	cudaEventRecord(gpustart, 0);

	multiplicateMatrixOnDevice<<<grid, block>>>(d_A, d_B, d_C, M, K, N);
//	printf("   multiplicateMatrixOnDevice<<<(%d,%d),(%d,%d)>>>", grid.x, grid.y, block.x, block.y);
	cudaDeviceSynchronize();
	cudaEventRecord(gpustop, 0);
	cudaEventSynchronize(gpustop);

	cudaEventElapsedTime(&elapsedTime, gpustart, gpustop);
	cudaEventDestroy(gpustart);
	cudaEventDestroy(gpustop);


	cudaMemcpy(deviceRef, d_C, nBytes, cudaMemcpyDeviceToHost);
	printf("Matrix_deviceRef: (%d×%d)  <<<(%d,%d),(%d,%d)>>>  GPU time: %fs\n",
		M, N, grid.x, grid.y, block.x, block.y, elapsedTime / 1000);
	printMatrix(deviceRef, M,N);
	printf("Speedup: %lf\n\n", cpuTime / (elapsedTime / 1000));

	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);

	free(h_A);
	free(h_B);
	free(hostRef);
	free(deviceRef);

	cudaDeviceReset();

	return (0);
}
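
None of the CUDA API calls above have their return codes checked, so a failed cudaMalloc or cudaMemcpy would go unnoticed. Below is a minimal error-checking sketch that each runtime call could be wrapped in; the CHECK macro name is my own addition, not something from the original listing:

```
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"

/* Hypothetical helper, not part of the listing above: print a readable
   message and abort when a CUDA runtime call returns an error. */
#define CHECK(call)                                                      \
	do {                                                                 \
		cudaError_t err = (call);                                        \
		if (err != cudaSuccess) {                                        \
			fprintf(stderr, "CUDA error '%s' at %s:%d\n",                \
			        cudaGetErrorString(err), __FILE__, __LINE__);        \
			exit(EXIT_FAILURE);                                          \
		}                                                                \
	} while (0)

/* Example: CHECK(cudaMalloc((void**)&d_A, ABytes)); */
```

Assuming the source is saved as matmul.cu (the post does not name the file), it can be compiled and run with:

```
nvcc matmul.cu -o matmul
./matmul
```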