OpenCL矩阵乘法的例子

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u011028771/article/details/52804155

前两篇博客中介绍过矩阵转置的两种方法,矩阵乘法可以先对矩阵做转置运算,然后再对应相乘;
矩阵大小是658192
先对8192
65的矩阵转置为65*8192;
然后由65个工作项,每个工作项负责一行数据的乘法;
完成65行数据的乘法;
转置是由8192个工作项完成的;
代码中给出了C语言实现矩阵转置的测试代码;
感兴趣的朋友可以测试一下C语言代码的运算时间和GPU运算时间的差别;
下面是代码:

houst.c
#include<stdio.h>
#include<windows.h>
#include<CL/cl.h>
#pragma warning( disable : 4996 )
#define MIXSIZE 8192*65

int main() {
	cl_int error;
	cl_platform_id platforms;
	cl_device_id devices;
	cl_context context;
	FILE *program_handle;
	size_t program_size;
	char *program_buffer;
	cl_program program;
	size_t log_size;
	char *program_log;
	char tKernel_name[] = "mixtran";
	char mKernel_name[] = "mixmul";
	cl_kernel tKernel;
	cl_kernel mKernel;
	cl_command_queue queue;
	//获取平台
	error = clGetPlatformIDs(1, &platforms, NULL);
	if (error != 0) {
		printf("Get platform failed!");
		return -1;
	}
	error = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &devices, NULL);
	if (error != 0) {
		printf("Get device failed!");
		return -1;
	}
	//创建上下文
	context = clCreateContext(NULL,1,&devices,NULL,NULL,&error);
	if (error != 0) {
		printf("Creat context failed!");
		return -1;
	}
	//创建程序
	program_handle = fopen("kernel.cl","rb");
	if (program_handle == NULL) {
		printf("The kernle can not be opened!");
		return -1;
	}
	fseek(program_handle,0,SEEK_END);
	program_size = ftell(program_handle);
	rewind(program_handle);

	program_buffer = (char *)malloc(program_size+1);
	program_buffer[program_size] = '\0';
	error=fread(program_buffer,sizeof(char),program_size,program_handle);
	if (error == 0) {
		printf("Read kernel failed!");
		return -1;
	}
	fclose(program_handle);
	program = clCreateProgramWithSource(context,1,(const char **)&program_buffer,&program_size,&error);
	if (error < 0) {
		printf("Couldn't create the program!");
		return -1;
	}
	//编译程序
	error = clBuildProgram(program,1,&devices,NULL,NULL,NULL);
	if (error < 0) {
		//确定日志文件的大小
		clGetProgramBuildInfo(program,devices,CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
		program_log = (char *)malloc(log_size+1);
		program_log[log_size] = '\0';
		//读取日志
		clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
		printf("%s\n",program_log);
		free(program_log);
		getchar();
		return -1;
	}
	//创建命令队列
	queue = clCreateCommandQueue(context, devices, CL_QUEUE_PROFILING_ENABLE, &error);
	if (error < 0) {
		printf("Coudn't create the command queue");
		return -1;
	}

	/*本次实验进行矩阵乘法(A*B);
	*****矩阵乘法分为两步进行
	*****矩阵转置
	*****矩阵相乘
	***********************************
	*****参数说明*************
	**矩阵A=input1
	**矩阵B=input2
	**转置结果为C=input3;
	**相乘的输出结果为result
	**kernel1对B做转置
	**kernel2做乘法
	**矩阵大小65*8192
	*/

	//创建内核
	tKernel = clCreateKernel(program,tKernel_name,&error);
	if (tKernel==NULL) {
		printf("Couldn't create kernel!\n");
		return -1;
	}

	mKernel = clCreateKernel(program, mKernel_name, &error);
	if (mKernel == NULL) {
		printf("Couldn't create kernel!\n");
		return -1;
	}

	//创建缓存对象
	cl_mem memObject1 = clCreateBuffer(context,CL_MEM_READ_ONLY ,
		                                                            sizeof(float) * MIXSIZE,NULL,&error);
	if (error < 0) {
		printf("Creat memObject1 failed!\n");
		return -1;
	}
	cl_mem memObject2 = clCreateBuffer(context, CL_MEM_READ_ONLY,
		                                                           sizeof(float) * MIXSIZE, NULL, &error);
	if (error < 0) {
		printf("Creat memObject1 failed!\n");
		return -1;
	}
	cl_mem memObject3 = clCreateBuffer(context, CL_MEM_READ_WRITE , 
		                                                            sizeof(float) * MIXSIZE, NULL, &error);
	if (error < 0) {
		printf("Creat memObject2 failed!\n");
		return -1;
	}
	cl_mem memObject4 = clCreateBuffer(context, CL_MEM_WRITE_ONLY , 
		                                                            sizeof(float) * 65*65, NULL, &error);
	if (error < 0) {
		printf("Creat memObject3 failed!\n");
		return -1;
	}
	//设置内核参数
	error = clSetKernelArg(tKernel,0,sizeof(cl_mem),&memObject1);
	error|= clSetKernelArg(tKernel, 1, sizeof(cl_mem), &memObject3);
	if (error != CL_SUCCESS) {
		printf("Error setting kernel arguments!\n");
		return -1;
	}

	error  = clSetKernelArg(mKernel, 0, sizeof(cl_mem), &memObject2);
	error |= clSetKernelArg(mKernel, 1, sizeof(cl_mem), &memObject3);
	error |= clSetKernelArg(mKernel, 2, sizeof(cl_mem), &memObject4);
	if (error != CL_SUCCESS) {
		printf("Error setting kernel arguments!\n");
		return -1;
	}
	
	//初始化参数
	float* input1 = (float *)malloc(sizeof(float)*MIXSIZE);
	float* input2 = (float *)malloc(sizeof(float)*MIXSIZE);
	float* input3 = (float *)malloc(sizeof(float)*MIXSIZE);
	float* result = (float *)malloc(sizeof(float)*65*65);
	float* check = (float *)malloc(sizeof(float)*65*65);
	memset(input3, 0, sizeof(float)*MIXSIZE);
	memset(result, 0, sizeof(float)*65*65);
	memset(check, 0, sizeof(float) * 65 * 65);
	//数据读入
	//采用随机数函数产生输入
	
	//input1是8192*65
	for (int i = 0; i < 8192; i++) {
		srand(i);
		for (int j = 0; j < 65; j++) {
			input1[65 * i + j] = 20 * rand() / (double)(RAND_MAX);
		}
	}
	//input2是65*8192
	for (int i = 0; i < 65; i++) {
		srand(i);
		for (int j = 0; j < 8192; j++) {
			input2[8192 * i + j] = 20 * rand() / (double)(RAND_MAX);
		}
	}
	/*
	iput2*input1得到65*65的矩阵
	先对input1做转置,然后行和行乘加和;
	*/

	for (int i = 0; i < 8192; i++) { 
		for (int j = 0; j < 65; j++) {
			input3[j*8192 + i] = input1[i*65 + j];
		}
	}
	for (int i = 0; i < 65; i++) {
		for (int k = 0; k < 65; k++) {
			for (int j = 0; j < 8192; j++) {
				check[i * 65 + k] += input2[j + i * 8192] * input3[j + k * 8192];
			}
		}
		
	}
	cl_int status = 0;
	cl_event evt1 ;
	cl_event evt2;
	cl_event evt3;

	//数据写入缓冲对像
	error = clEnqueueWriteBuffer(queue, memObject1, CL_FALSE, 0, 
		          MIXSIZE * sizeof(float), input1, 0, NULL, &evt1);
	if (error != CL_SUCCESS) {
		printf("write data failed!\n");
		return -1;
	}

	error = clEnqueueWriteBuffer(queue, memObject2, CL_FALSE, 0,
		         MIXSIZE * sizeof(float), input2, 0, NULL, &evt1);
	if (error != CL_SUCCESS) {
		printf("write data failed!\n");
		return -1;
	}

	//配置工作项(转置)
	size_t maxWorkGroupSize = 0;
	clGetDeviceInfo(devices, CL_DEVICE_MAX_WORK_GROUP_SIZE,
		sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
	size_t globalWorkSize = MIXSIZE / 65;
	size_t localWorkSize = maxWorkGroupSize;
	//执行内核
	error = clEnqueueNDRangeKernel(queue, tKernel, 1, NULL, &globalWorkSize,
		             &localWorkSize, 1, &evt1, &evt2);
	if (error != CL_SUCCESS) {
		printf("Error queuing kernel for execution!\n");
		return -1;
	}
	
	//配置工作项(乘法)
	 globalWorkSize = 65;
	 localWorkSize = NULL;
	//执行内核
	error = clEnqueueNDRangeKernel(queue, mKernel, 1, NULL, &globalWorkSize,
		NULL, 1, &evt2, &evt3);
	if (error != CL_SUCCESS) {
		printf("Error queuing kernel for execution!\n");
		return -1;
	}

	//读取内核结果
	 error = clEnqueueReadBuffer(queue, memObject4, CL_TRUE, 0,
		 65*65 * sizeof(float), result, 1, &evt3, NULL);
	 if (error != CL_SUCCESS) {
		 printf("Error reading result buffer!\n");
		 return -1;
	 }

	//检查结果
	for (int i = 0; i < 65*65; i++) {
		if (result[i] != check[i]) { 
			printf("failed!\n");
			printf("%f,%f,%d\n",result[i],check[i],i);
			getchar();
			return 0;
		}
	}
	printf("successed!\n");

	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	clReleaseDevice(devices);
	clReleaseKernel(tKernel);
	clReleaseKernel(mKernel);
	
	getchar();
	return 0;
}
kernel.cl
//矩阵转置
__kernel void mixtran(__global const float *input,
	__global  float *inputT) {
	int gid = get_global_id(0);
	for (int i = 0; i < 65; i++) {
		inputT[i*8192+gid] = input[gid*65+i];
	}
}

__kernel void mixmul(__global const float *input1,
	__global const float *input2,
	__global float *result) {
	int gid = get_global_id(0);
	for (int j = 0; j < 65; j ++ ) {
		for (int i = 0; i < 8192; i++) {
			result[gid*65+j] += input1[i + gid * 8192] * input2[i + j * 8192];
		}
	}
}

欢迎关注公众号:计算机视觉及高性能计算(to_2know)

在这里插入图片描述

OpenCL 矩阵乘法

04-02

OpenCL的矩阵乘法,结果每次都一样,而且都不对,不知道问题在哪,求大神解答,谢谢。rnrn[code=c]rn#include rn#include rn#include rn#include rn#include rnusing namespace std;rnrn#define MAX_SOURCE_SIZE 10000rnrnsize_t ReadSource(const char* sourceName, char* sourceStr)rnrn FILE* fp;rn size_t sourceSize;rn fp = fopen(sourceName, "r");rnrn if (!fp)rn rn fprintf(stderr, "Failed to load kernel\n");rn exit(1);rn rnrn sourceSize = fread(sourceStr, 1, MAX_SOURCE_SIZE, fp);rn fclose(fp);rn return sourceSize;rnrnrnint main(int argc, char* argv[])rnrn int M = 2, N = 2, P = 2;rnrn // 输入矩阵rn float *A;rn float *B;rn // 输出矩阵rn float *C;rnrn // 矩阵大小rn int szA = N * P;rn int szB = P * M;rn int szC = N * M;rnrn // 设备输入缓冲区rn cl_mem d_a;rn cl_mem d_b;rn // 设备输出缓冲区rn cl_mem d_c;rnrn cl_platform_id cpPlatform; // OpenCL 平台rn cl_device_id device_id; // device IDrn cl_context context; // contextrn cl_command_queue queue; // command queuern cl_program program; // programrn cl_kernel kernel; // kernelrnrn //(为每个矩阵分配内存)rn A = (float*)malloc(szA * sizeof(float));rn B = (float*)malloc(szB * sizeof(float));rn C = (float*)malloc(szC * sizeof(float));rnrn //(初始化矩阵)rn for (int i = 0; i < szA; i++)rn rn A[i] = i;rn cout << A[i] << " ";rn rn cout << endl;rn for (int i = 0; i < szB; i++)rn rn B[i] = i;rn cout << B[i] << " ";rn rn cout << endl;rnrn cl_int err;rnrn //(获得平台ID)rn err = clGetPlatformIDs(1, &cpPlatform, NULL);rn cout << "platformID: " << cpPlatform << endl;rnrn //(获得设备ID,与平台有关)rn err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);rn cout << "deviceID: " << device_id << endl;rnrn //(根据设备ID,得到上下文)rn context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);rnrn //(根据上下文,在设备上创建命令队列)rn queue = clCreateCommandQueue(context, device_id, 0, &err);rnrn //(根据OpenCL源程序创建计算程序)rn char* kernelSource = new char[10000];rn int size_n = ReadSource("MatrixMul.cl", kernelSource);rn size_t sourceSize[] = size_n ;rnrn program = clCreateProgramWithSource(context, 1, (const char**)&kernelSource, sourceSize, &err);rn if (err != CL_SUCCESS)rn rn printf("clCreateProgramWithSource error");rn return -1;rn rnrn //(创建可执行程序)rn clBuildProgram(program, 0, NULL, NULL, NULL, NULL);rnrn //(在上面创建的程序中创建内核程序)rn kernel = clCreateKernel(program, "MatrixMul", &err);rn if (err != CL_SUCCESS) rn cout << "clCreateKernel error" << endl;rn return -1;rn rnrn //(分配设备缓冲)rn d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, szA, NULL, NULL);rn d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, szB, NULL, NULL);rn d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, szC, NULL, NULL);rnrn // (将矩阵信息写入设备缓冲)rn err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,rn szA, A, 0, NULL, NULL);rn err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,rn szB, B, 0, NULL, NULL);rn if (err != CL_SUCCESS) rn cout << "clEnqueueWriteBuffer err" << endl;rn return -1;rn rnrn // (设置计算内核的参数)rn err = clSetKernelArg(kernel, 0, sizeof(int), &M);rn err |= clSetKernelArg(kernel, 1, sizeof(int), &N);rn err |= clSetKernelArg(kernel, 2, sizeof(int), &P);rn err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a);rn err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b);rn err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c);rnrn if (err != CL_SUCCESS) rn cout << "clSetKernelArg err" << endl;rn return -1;rn rnrn // (在数据集的范围内执行内核)Execute the kernel over the entire range of the data setrn size_t globalThreads[2] = N, M ;rn size_t localThreads[2] = 16, 16 ;rn err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, NULL,rn 0, NULL, NULL);rn if (err != CL_SUCCESS) rn cout << "clEnqueueNDRangeKernel err" << endl;rn return -1;rn rnrn // (在读出结果之前,等待命令队列执行完毕)Wait for the command queue to get serviced before reading back resultsrn clFinish(queue);rnrn // (从设备缓冲区读出结果)Read the results from the devicern clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,rn szC, C, 0, NULL, NULL);rnrn //(输出读出的结果)rn float sum = 0;rnrn for (int i = 0; i < szC; i++)rn printf("%d ", C[i]);rnrn cout << endl;rnrn // (释放资源)rn clReleaseMemObject(d_a);rn clReleaseMemObject(d_b);rn clReleaseMemObject(d_c);rn clReleaseProgram(program);rn clReleaseKernel(kernel);rn clReleaseCommandQueue(queue);rn clReleaseContext(context);rnrn //(释放内存)rn free(A);rn free(B);rn free(C);rnrn return 0;rnrn[/code]rnrn[code=c]rn__kernel void MatrixMul(const int Mdim, const int Ndim, const int Pdim,rn __global float *A, __global float *B, __global float *C)rnrn int k;rn int i = get_global_id(0);rn int j = get_global_id(1);rn float tmp;rnrn if ((i < Ndim) && (j < Mdim))rn rn tmp = 0.0;rn for (k = 0; k < Ndim; k++)rn tmp += A[i*Ndim+k] * B[k*Pdim+j];rn C[i*Ndim+j] = tmp;rn rnrn[/code]

没有更多推荐了,返回首页

私密
私密原因:
请选择设置私密原因
  • 广告
  • 抄袭
  • 版权
  • 政治
  • 色情
  • 无意义
  • 其他
其他原因:
120
出错啦
系统繁忙,请稍后再试