OpenCL矩阵乘法

最新推荐文章于 2024-09-10 22:15:04 发布

努力搬砖ll

最新推荐文章于 2024-09-10 22:15:04 发布

阅读量128

点赞数

文章标签：矩阵算法

本文链接：https://blog.csdn.net/weixin_49562509/article/details/134069811

版权

本文详细介绍了使用OpenCL在GPU上实现矩阵乘法的过程，包括创建平台、设备、内存缓冲区、构建程序、执行内核，并对比了CPU和OpenCL的执行时间。

摘要由CSDN通过智能技术生成

#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Edge length of the square matrices. main() uses it for the host arrays,
 * the device buffer sizes, both dimensions of the global work size, and
 * passes it to the kernel as its `width` argument. */
const int SIZE = 512;

/*
 * OpenCL C source of the `matrixMultiply` kernel; main() hands this string to
 * clCreateProgramWithSource and builds it at run time.
 *
 * Each work-item takes its 2-D global id as (row, col) and writes a single
 * element: C[row * width + col] = sum over i of
 * A[row * width + i] * B[i * width + col], i.e. the matrices are addressed
 * as flat, row-major int arrays of `width` columns.
 *
 * The kernel does no bounds checking on row/col, so the launch must not use a
 * global work size larger than `width` in either dimension.
 */
const char* kernelSource =
"__kernel void matrixMultiply(__global int *A, __global int *B, __global int *C, int width) {\n"
"    int row = get_global_id(0);\n"
"    int col = get_global_id(1);\n"
"    int sum = 0;\n"
"    for (int i = 0; i < width; ++i) {\n"
"        sum += A[row * width + i] * B[i * width + col];\n"
"    }\n"
"    C[row * width + col] = sum;\n"
"}\n";

int main() {
    printf("OpenCL矩阵乘法\n");
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;

    // 初始化OpenCL环境
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

    // 创建矩阵并分配内存
    int A[SIZE][SIZE], B[SIZE][SIZE], C[SIZE][SIZE];
    for (int i = 0; i < SIZE; ++i) {
        for (int j = 0; j < SIZE; ++j) {
            A[i][j] = 1;
            B[i][j] = 1;
        }
    }

    // 创建缓冲区
    cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE * SIZE, A, NULL);
    cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE * SIZE, B, NULL);
    cl_mem bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * SIZE * SIZE, NULL, NULL);

    // 创建OpenCL程序并构建
    program = clCreateProgramWithSource(context, 1, (const char**)&kernelSource, NULL, NULL);
    clBuildProgram(program, 1, &device, NULL, NULL, NULL);

    // 创建内核
    kernel = clCreateKernel(program, "matrixMultiply", NULL);

    // 设置内核参数
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
    clSetKernelArg(kernel, 3, sizeof(int), &SIZE);

    // 创建命令队列
    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

    // 启动内核
    size_t globalWorkSize[2] = { SIZE, SIZE };
    cl_event event;
    clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, &event);
    clWaitForEvents(1, &event);

    // 从设备内存中读取结果
    clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, sizeof(int) * SIZE * SIZE, C, 0, NULL, NULL);

    // 测量CPU执行时间
    clock_t start_cpu = clock();
    // 执行矩阵乘法在CPU上
    int result[SIZE][SIZE];
    for (int i = 0; i < SIZE; ++i) {
        for (int j = 0; j < SIZE; ++j) {
            int sum = 0;
            for (int k = 0; k < SIZE; ++k) {
                sum += A[i][k] * B[k][j];
            }
            result[i][j] = sum;
        }
    }
    clock_t end_cpu = clock();
    double cpu_time = ((double)(end_cpu - start_cpu)) / CLOCKS_PER_SEC;
    printf("CPU执行时间：%f秒\n", cpu_time);

    // 测量OpenCL执行时间
    cl_ulong start_time, end_time;
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start_time, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end_time, NULL);
    double opencl_time = (double)(end_time - start_time) * 1.0e-9; // 转换为秒
    printf("OpenCL执行时间：%f秒\n", opencl_time);

    // 释放OpenCL资源
    clReleaseMemObject(bufferA);
    clReleaseMemObject(bufferB);
    clReleaseMemObject(bufferC);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}