#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Number of rows and columns of each square matrix used by this program. */
const int SIZE = 512;
/*
 * OpenCL C source of the matrixMultiply kernel.
 *
 * Each work-item computes one element of C = A * B for square, row-major
 * int matrices of edge length `width`: dimension 0 of the NDRange selects
 * the row and dimension 1 selects the column.
 */
const char* kernelSource =
"__kernel void matrixMultiply(__global const int *A, __global const int *B, __global int *C, int width) {\n"
" int row = get_global_id(0);\n"
" int col = get_global_id(1);\n"
/* Work-items outside the matrix (e.g. when the global size is padded up to a
   multiple of the work-group size) must not read or write any element. */
" if (row >= width || col >= width) {\n"
" return;\n"
" }\n"
" int sum = 0;\n"
" for (int i = 0; i < width; ++i) {\n"
" sum += A[row * width + i] * B[i * width + col];\n"
" }\n"
" C[row * width + col] = sum;\n"
"}\n";
int main() {
printf("OpenCL矩阵乘法\n");
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
// 初始化OpenCL环境
clGetPlatformIDs(1, &platform, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
// 创建矩阵并分配内存
int A[SIZE][SIZE], B[SIZE][SIZE], C[SIZE][SIZE];
for (int i = 0; i < SIZE; ++i) {
for (int j = 0; j < SIZE; ++j) {
A[i][j] = 1;
B[i][j] = 1;
}
}
// 创建缓冲区
cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE * SIZE, A, NULL);
cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE * SIZE, B, NULL);
cl_mem bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * SIZE * SIZE, NULL, NULL);
// 创建OpenCL程序并构建
program = clCreateProgramWithSource(context, 1, (const char**)&kernelSource, NULL, NULL);
clBuildProgram(program, 1, &device, NULL, NULL, NULL);
// 创建内核
kernel = clCreateKernel(program, "matrixMultiply", NULL);
// 设置内核参数
clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
clSetKernelArg(kernel, 3, sizeof(int), &SIZE);
// 创建命令队列
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);
// 启动内核
size_t globalWorkSize[2] = { SIZE, SIZE };
cl_event event;
clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, &event);
clWaitForEvents(1, &event);
// 从设备内存中读取结果
clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, sizeof(int) * SIZE * SIZE, C, 0, NULL, NULL);
// 测量CPU执行时间
clock_t start_cpu = clock();
// 执行矩阵乘法在CPU上
int result[SIZE][SIZE];
for (int i = 0; i < SIZE; ++i) {
for (int j = 0; j < SIZE; ++j) {
int sum = 0;
for (int k = 0; k < SIZE; ++k) {
sum += A[i][k] * B[k][j];
}
result[i][j] = sum;
}
}
clock_t end_cpu = clock();
double cpu_time = ((double)(end_cpu - start_cpu)) / CLOCKS_PER_SEC;
printf("CPU执行时间:%f秒\n", cpu_time);
// 测量OpenCL执行时间
cl_ulong start_time, end_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start_time, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end_time, NULL);
double opencl_time = (double)(end_time - start_time) * 1.0e-9; // 转换为秒
printf("OpenCL执行时间:%f秒\n", opencl_time);
// 释放OpenCL资源
clReleaseMemObject(bufferA);
clReleaseMemObject(bufferB);
clReleaseMemObject(bufferC);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
OpenCL矩阵乘法
最新推荐文章于 2024-09-10 22:15:04 发布
本文详细介绍了使用OpenCL在GPU上实现矩阵乘法的过程,包括获取平台和设备、创建上下文与内存缓冲区、构建程序、执行内核,并对比了CPU和OpenCL的执行时间。
摘要由CSDN通过智能技术生成