java计算加速减速_java – 使用JOCL / OPENCL加速图像强度总和的计算

嗨,我是JOCL(opencl)的新手.我写了这段代码来得出每幅图像强度的总和.内核采用所有图像的所有像素的一维数组.图像为300×300,因此每张图像为90000像素.目前它比我按顺序执行时更慢.

我的代码

package PAR;

/*

* JOCL - Java bindings for OpenCL

*

* Copyright 2009 Marco Hutter - http://www.jocl.org/

*/

import IMAGE_IO.ImageReader;

import IMAGE_IO.Input_Folder;

import static org.jocl.CL.*;

import org.jocl.*;

/**

* A small JOCL sample.

*/

/**
 * JOCL sample that computes, for every image in a batch, the sum of its
 * pixel values. All images are stored back to back in one flat array and
 * each OpenCL work-item sums the pixels of exactly one image.
 */
public class IPPARA {

    /**
     * Number of pixels in one image (300 x 300). The flat input array is
     * interpreted as consecutive runs of this many values, one run per image.
     */
    private static final int PIXELS_PER_IMAGE = 300 * 300;

    /**
     * The source code of the OpenCL program to execute.
     *
     * Work-item {@code i} accumulates the values
     * {@code a[i * PIXELS_PER_IMAGE .. (i + 1) * PIXELS_PER_IMAGE - 1]} in a
     * private variable and stores the total once in {@code c[i]}.
     */
    private static final String PROGRAM_SOURCE =
        "__kernel void "
        + "sampleKernel(__global const uint *a,"
        + " __global uint *c)"
        + "{"
        + " __private uint intensity_core = 0;"
        + " uint i = get_global_id(0);"
        + " uint first = i * " + PIXELS_PER_IMAGE + "u;"
        + " uint end = first + " + PIXELS_PER_IMAGE + "u;"
        + " for (uint j = first; j < end; j++) {"
        + "  intensity_core += a[j];"
        + " }"
        + " c[i] = intensity_core;"
        + "}";

    /**
     * The entry point of this sample.
     *
     * Reads all images through {@code ImageReader}, runs the summing kernel
     * on a CPU OpenCL device and prints the elapsed wall-clock time in
     * seconds. The measured interval starts after the images were read and
     * therefore covers context creation, program compilation, the kernel run,
     * the read-back and the release of all OpenCL objects.
     *
     * @param args Not used
     */
    public static void main(String args[]) {
        ImageReader imagereader = new ImageReader();
        int srcArrayA[] = imagereader.readImages();

        long before = System.nanoTime();

        // Trailing values that do not fill a whole image are ignored.
        int imageCount = srcArrayA.length / PIXELS_PER_IMAGE;
        if (imageCount == 0) {
            // A zero-sized output buffer cannot be created, so stop early.
            System.out.println("No complete image to process");
            return;
        }

        int dstArray[] = new int[imageCount];
        Pointer srcA = Pointer.to(srcArrayA);
        Pointer dst = Pointer.to(dstArray);

        // Obtain the platform ID and initialize the context properties
        System.out.println("Obtaining platform...");
        cl_platform_id platforms[] = new cl_platform_id[1];
        clGetPlatformIDs(platforms.length, platforms, null);
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);

        // Create an OpenCL context on a CPU device. Exceptions are still
        // disabled here, so a failure is reported through a null context.
        cl_context context = clCreateContextFromType(
            contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);
        if (context == null) {
            System.out.println("Unable to create a context");
            return;
        }

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        cl_command_queue commandQueue = null;
        cl_mem memObjects[] = new cl_mem[2];
        cl_program program = null;
        cl_kernel kernel = null;
        try {
            // Query the size of the device list, then the list itself
            long numBytes[] = new long[1];
            clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);
            int numDevices = (int) (numBytes[0] / Sizeof.cl_device_id);
            cl_device_id devices[] = new cl_device_id[numDevices];
            clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
                Pointer.to(devices), null);

            // Create a command-queue on the first device of the context
            commandQueue = clCreateCommandQueue(context, devices[0], 0, null);

            // Buffer sizes are computed in long arithmetic so that large
            // inputs cannot overflow an int byte count.
            long inputBytes = (long) Sizeof.cl_uint * srcArrayA.length;
            long outputBytes = (long) Sizeof.cl_uint * imageCount;

            // Allocate the memory objects for the input- and output data
            memObjects[0] = clCreateBuffer(context,
                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                inputBytes, srcA, null);
            memObjects[1] = clCreateBuffer(context,
                CL_MEM_READ_WRITE,
                outputBytes, null, null);

            // Create and build the program from the source code
            program = clCreateProgramWithSource(context,
                1, new String[]{PROGRAM_SOURCE}, null, null);
            clBuildProgram(program, 0, null, null, null, null);

            // Create the kernel and bind its two buffer arguments
            kernel = clCreateKernel(program, "sampleKernel", null);
            clSetKernelArg(kernel, 0,
                Sizeof.cl_mem, Pointer.to(memObjects[0]));
            clSetKernelArg(kernel, 1,
                Sizeof.cl_mem, Pointer.to(memObjects[1]));

            // One work-item per image. The local work size is left to the
            // OpenCL implementation (null) instead of forcing groups of 1.
            long global_work_size[] = new long[]{imageCount};
            clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
                global_work_size, null, 0, null, null);

            // Blocking read (CL_TRUE) of the per-image sums; the size uses
            // the same element type the output buffer was created with.
            clEnqueueReadBuffer(commandQueue, memObjects[1], CL_TRUE, 0,
                outputBytes, dst, 0, null, null);
        } finally {
            // Release whatever was created, also when a call above threw
            if (memObjects[0] != null) {
                clReleaseMemObject(memObjects[0]);
            }
            if (memObjects[1] != null) {
                clReleaseMemObject(memObjects[1]);
            }
            if (kernel != null) {
                clReleaseKernel(kernel);
            }
            if (program != null) {
                clReleaseProgram(program);
            }
            if (commandQueue != null) {
                clReleaseCommandQueue(commandQueue);
            }
            clReleaseContext(context);
        }

        long after = System.nanoTime();
        System.out.println("Time: " + (after - before) / 1e9);
    }
}

采纳答案中的建议之后,在CPU上运行的并行代码几乎与顺序代码一样快.是否还有其他改进措施?

解决方法:

for(uint j=i*90000; j < (i+1)*90000; j++){ "

+ " c[i] += a[j];"

1)您使用全局内存(c [])求和,这很慢.使用私有变量使其更快.

像这样的东西:

"__kernel void "

+ "sampleKernel(__global uint *a,"

+ " __global uint *c)"

+ "{"

+ "__private uint intensity_core=0;"

+ " uint i = get_global_id(0);"

+ " for(uint j=i*90000; j < (i+1)*90000; j++){ "

+ " intensity_core += a[j];"

//but we cannot get rid of a[] so the calculation time cannot be less than %50

+ " }"

+ "c[i]=intensity_core;"

+ "}"; //expecting %100 speedup

现在你有c [图像数]数组的强度和.

你的本地工作大小为1,如果你有至少160张图像(160是你的gpu的核心数),那么计算将使用所有核心.

您将需要90000 * num_images次读取和num_images写入以及90000 * num_images寄存器读/写.使用寄存器会使内核时间缩短一半.

2)每2次内存访问只进行1次数学运算.你需要每1次内存访问至少进行10次数学运算,才能用上你gpu峰值Gflops的一小部分(6490M的峰值为250 Gflops).

你的i7 cpu可以轻松达到100 Gflops,但内存将成为瓶颈.当您通过pci-express发送全部数据时,情况会更糟.(HD Graphics 3000的额定值为125 GFLOPS)

// Obtain a device ID

cl_device_id devices[] = new cl_device_id[numDevices];

clGetDeviceIDs(platform, deviceType, numDevices, devices, null);

cl_device_id device = devices[deviceIndex];

//one of devices[] element must be your HD3000.Example: devices[0]->gpu devices[1]->cpu

//devices[2]-->HD3000

在你的程序中:

// Obtain the cl_device_id for the first device

int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;

cl_device_id devices[] = new cl_device_id[numDevices];

clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],

Pointer.to(devices), null);

第一个设备可能是gpu.

标签:java,opencl,jocl

来源: https://codeday.me/bug/20190704/1372857.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值