所谓矩阵,在内存中也只是一串连续存放的数字,所以转置就是调换数据存储的相对位置。上一篇博客介绍了按照转置定义直接进行转置的技巧,但那种方法仅限于方阵,对于一般矩阵则比较困难(可以将一般矩阵分块处理)。因此本文介绍一般矩阵的转置方法。
方法一
http://blog.csdn.net/u011028771/article/details/52733929
矩阵大小为65行8192列。
采用256个工作项,每个工作项完成32列数据的转置(256 × 32 = 8192列)。
host.c
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<windows.h>
#include<CL/cl.h>
#pragma warning( disable : 4996 )
/* Total number of matrix elements (8192 columns x 65 rows). Parenthesized so
 * the macro expands as a single value inside larger expressions such as
 * `x / MIXSIZE` or `x % MIXSIZE`. */
#define MIXSIZE (8192*65)
int main() {
cl_int error;
cl_platform_id platforms;
cl_device_id devices;
cl_context context;
FILE *program_handle;
size_t program_size;
char *program_buffer;
cl_program program;
size_t log_size;
char *program_log;
char kernel_name[] = "createBuffer";
cl_kernel kernel;
cl_command_queue queue;
//获取平台
error = clGetPlatformIDs(1, &platforms, NULL);
if (error != 0) {
printf("Get platform failed!");
return -1;
}
error = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &devices, NULL);
if (error != 0) {
printf("Get device failed!");
return -1;
}
//创建上下文
context = clCreateContext(NULL,1,&devices,NULL,NULL,&error);
if (error != 0) {
printf("Creat context failed!");
return -1;
}
//创建程序
program_handle = fopen("kernel.cl","rb");
if (program_handle == NULL) {
printf("The kernle can not be opened!");
return -1;
}
fseek(program_handle,0,SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char *)malloc(program_size+1);
program_buffer[program_size] = '\0';
error=fread(program_buffer,sizeof(char),program_size,program_handle);
if (error == 0) {
printf("Read kernel failed!");
return -1;
}
fclose(program_handle);
program = clCreateProgramWithSource(context,1,(const char **)&program_buffer,&program_size,&error);
if (error < 0) {
printf("Couldn't create the program!");
return -1;
}
//编译程序
error = clBuildProgram(program,1,&devices,NULL,NULL,NULL);
if (error < 0) {
//确定日志文件的大小
clGetProgramBuildInfo(program,devices,CL_PROGRAM_BUILD_LOG,
0,NULL,&log_size);
program_log = (char *)malloc(log_size+1);
program_log[log_size] = '\0';
//读取日志
clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
printf("%s\n",program_log);
free(program_log);
getchar();
return -1;
}
//创建命令队列
queue = clCreateCommandQueue(context, devices, CL_QUEUE_PROFILING_ENABLE, &error);
if (error < 0) {
printf("Coudn't create the command queue");
return -1;
}
//创建内核
kernel = clCreateKernel(program,kernel_name,&error);
if (kernel==NULL) {
printf("Couldn't create kernel!\n");
return -1;
}
//创建缓存对象
cl_mem memObject1 = clCreateBuffer(context,CL_MEM_READ_ONLY ,
sizeof(float) * MIXSIZE,NULL,&error);
if (error < 0) {
printf("Creat memObject1 failed!\n");
return -1;
}
cl_mem memObject2 = clCreateBuffer(context, CL_MEM_READ_WRITE , sizeof(float) * MIXSIZE, NULL, &error);
if (error < 0) {
printf("Creat memObject2 failed!\n");
return -1;
}
//设置内核参数
error = clSetKernelArg(kernel,0,sizeof(cl_mem),&memObject1);
error|= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObject2);
//error |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObject3);
if (error != CL_SUCCESS) {
printf("Error setting kernel arguments!\n");
return -1;
}
/*本次实验进行矩阵乘法(A*B);
*****矩阵乘法分为两步进行
*****矩阵转置
*****矩阵相乘
***********************************
*****参数说明*************
**矩阵A=input1
**矩阵B=input2
**转置结果为C=input3;
**相乘的输出结果为result
**kernel1对B做转置
**kernel2做乘法
**矩阵大小65*8192
*/
//初始化参数
float* input1 = (float *)malloc(sizeof(float)*MIXSIZE);
float* input2 = (float *)malloc(sizeof(float)*MIXSIZE);
float* input3 = (float *)malloc(sizeof(float)*MIXSIZE);
float* result = (float *)malloc(sizeof(float)*MIXSIZE);
float* check = (float *)malloc(sizeof(float)*MIXSIZE);
memset(input3, 0, sizeof(float)*MIXSIZE);
memset(result, 0, sizeof(float)*MIXSIZE);
//数据读入
//采用随机数函数产生输入
for (int i = 0; i < 65; i++) {
srand(i);
for (int j = 0; j < 8192; j++) {
input1[8192 * i + j] = 20*rand() / (double)(RAND_MAX);
input2[8192 * i + j] = 20 * rand() / (double)(RAND_MAX);
//input1[8192 * i + j] = 2;
//input2[8192 * i + j] = 2;
}
}
for (int i = 0; i < 8192; i++) {
for (int j = 0; j < 65; j++) {
check[i * 65 + j] = input2[j * 8192 + i];
}
}
cl_int status = 0;
cl_event evt1 ;
cl_event evt2;
//配置工作项
size_t maxWorkGroupSize = 0;
clGetDeviceInfo(devices, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
size_t globalWorkSize = MIXSIZE;
size_t localWorkSize = maxWorkGroupSize;
//数据写入缓冲对像
error = clEnqueueWriteBuffer(queue, memObject1, CL_FALSE, 0, MIXSIZE * sizeof(float), input2, 0, NULL, &evt1);
if (error != CL_SUCCESS) {
printf("write data failed!\n");
return -1;
}
//执行内核
error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &localWorkSize,&localWorkSize, 1, &evt1, &evt2);
if (error != CL_SUCCESS) {
printf("Error queuing kernel for execution!\n");
return -1;
}
//读取内核结果
error = clEnqueueReadBuffer(queue, memObject2, CL_TRUE, 0, MIXSIZE * sizeof(float), input3, 0, NULL, NULL);
if (error != CL_SUCCESS) {
printf("Error reading result buffer!\n");
return -1;
}
//检查结果
for (int i = 0; i < MIXSIZE; i++) {
if (input3[i] != check[i]) {
printf("failed!\n");
printf("%f,%f,%d\n",result[i],check[i],i);
getchar();
return 0;
}
}
printf("successed!\n");
clReleaseProgram(program);
clReleaseContext(context);
clReleaseCommandQueue(queue);
clReleaseDevice(devices);
clReleaseKernel(kernel);
getchar();
return 0;
}
注:
代码中有些变量在本文中没有用到,它们是为后续的矩阵乘法预先声明的;下一篇博客将进行矩阵乘法的实验。
kernel.cl
// Matrix transpose, 32 columns per work item.
// `input` is indexed as 65 rows x 8192 columns (row-major); `inputT` receives
// the transpose, indexed as 8192 rows x 65 columns. Work item g handles source
// columns [32*g, 32*g + 31], so 256 work items cover the whole matrix.
__kernel void createBuffer(__global const float *input,
                           __global float *inputT) {
    const size_t first_col = get_global_id(0) * 32;

    // Work items beyond the 256 needed would index past both buffers.
    // 8192 is a multiple of 32, so checking the first column is sufficient.
    if (first_col >= 8192) {
        return;
    }

    for (int i = 0; i < 32; i++) {
        const size_t col = first_col + i;
        for (int j = 0; j < 65; j++) {
            inputT[col * 65 + j] = input[j * 8192 + col];
        }
    }
}
同样,我们也可以申请8192个全局工作项来完成该工作,此时每个工作项只负责转置一列(主机端的全局工作项数需相应设置为8192):
kernel.cl
// Matrix transpose, one column per work item.
// `input` is indexed as 65 rows x 8192 columns (row-major); `inputT` receives
// the transpose, indexed as 8192 rows x 65 columns. Work item g copies source
// column g into row g of the output, so 8192 work items are required.
__kernel void createBuffer(__global const float *input,
                           __global float *inputT) {
    const size_t col = get_global_id(0);

    // Ignore surplus work items (e.g. when the global size was rounded up);
    // they would otherwise index past both buffers.
    if (col >= 8192) {
        return;
    }

    for (int row = 0; row < 65; row++) {
        inputT[col * 65 + row] = input[row * 8192 + col];
    }
}