原文地址:http://blog.csdn.net/zenny_chen/article/details/6159746
在OpenCL中,用__local(或local)修饰的变量会被存放在一个计算单元(Compute Unit)的共享存储器区域中。对于nVidia的GPU,一个CU可以被映射为物理上的一块SM(Stream Multiprocessor);而对于AMD-ATi的GPU可以被映射为物理上的一块SIMD。不管是SM也好,SIMD也罢,它们都有一个在本计算单元中被所有线程(OpenCL中称为Work Item)所共享的共享存储器。因此,在一个计算单元内,可以通过local shared memory来同步此计算单元内的所有工作项。
这里必须注意的是在计算单元之间的线程的通信只能通过全局存储器进行,因为每个计算单元之间是没有共享存储器的,呵呵。
下面我将证明Apple的OpenCL实现中,如果有两个Work Group(一个Work Group的处理交给一个计算单元执行),那么这两个Work Group正好能分别被映射到一个计算单元内。我用的是Mac Mini,GPU为GeForce 9400M,所有仅有两个SM,呵呵。
下面先给出kernel代码:
__kernel void solve_sum(
__global volatile unsigned buffer[512],
__global unsigned dest[512]
)
{
__local volatile int flag = 0;
size_t gid = get_global_id(0);
if(0 <= gid && gid < 32)
{
while(flag != 1);
flag = 0;
buffer[gid] = 0x1UL;
//write_mem_fence(CLK_GLOBAL_MEM_FENCE);
}
else if(32 <= gid && gid < 64)
{
flag = 1;
while(flag != 0);
unsigned ret = buffer[31 + 32 - gid];
dest[gid - 32] = ret;
}
}
上面这个内核程序的配置为:分为两个工作组;每组32个工作项。这样,两个工作组能进不同的SM。各位在执行这段代码时会发生死循环。然后等2到3秒后程序会自动退出,这点不用担心,呵呵。原因就是两个SM的共享变量flag是各有各的一份。假定,线程0到线程31进SM0,那么SM0的所有线程共享这个flag变量;而线程32到线程63进SM1,那么SM1的flag将被SM1的所有线程共享。而如果企图把这个(其实是两个)共享变量用于两个SM之间的通信,显然是无法成功的,呵呵。尽管代码上只写了一个flag,但实际上却有两个副本。
下面提供主机端代码:
#import <Foundation/Foundation.h>
#include <OpenCL/opencl.h>
static unsigned __attribute__((aligned(16))) buffer[512] = { 0 }; // original data set given to device
static unsigned __attribute__((aligned(16))) dest[512] = { 0 };
int opencl_execution(void)
{
int err; // error code returned from api calls
size_t local; // local domain size for our calculation
cl_platform_id platform_id; // added by zenny_chen
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem memOrg, memDst; // device memory used for the input array
// Create a platform
err = clGetPlatformIDs(1, &platform_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a platform!/n");
return EXIT_FAILURE;
}
// Connect to a compute device
//
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!/n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext((cl_context_properties[]){(cl_context_properties)CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0}, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!/n");
return EXIT_FAILURE;
}
// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!/n");
return EXIT_FAILURE;
}
// Fetch kernel source
NSString *filepath = [[NSBundle mainBundle] pathForResource:@"kernel" ofType:@"cl"];
if(filepath == NULL)
{
puts("Source not found!");
return EXIT_FAILURE;
}
const char *KernelSource = (const char*)[[NSString stringWithContentsOfFile:filepath encoding:NSUTF8StringEncoding error:nil] UTF8String];
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!/n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!/n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s/n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
kernel = clCreateKernel(program, "solve_sum", &err);
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!/n");
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
memOrg = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * 512, NULL, NULL);
memDst = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * 512, NULL, NULL);
if (memOrg == NULL || memDst == NULL)
{
printf("Error: Failed to allocate device memory!/n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, memOrg, CL_TRUE, 0, sizeof(int) * 512, buffer, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!/n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memOrg);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memDst);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d/n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d/n", err);
exit(1);
}
else
printf("The number of work items in a work group is: %lu/r/n", local);
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, (size_t[]){ 64 }, (size_t[]){ 32 }, 0, NULL, NULL);
if (err)
{
printf("Error: Failed to execute kernel!/n");
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clFinish(commands);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer(commands, memDst, CL_TRUE, 0, sizeof(int) * 512, dest, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d/n", err);
exit(1);
}
// Validate our results
//
printf("The result is: 0x%.8X/n", dest[0]);
// Shutdown and cleanup
//
clReleaseMemObject(memOrg);
clReleaseMemObject(memDst);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
int main (int argc, const char * argv[]) {
NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init];
// insert code here...
opencl_execution();
[pool drain];
return 0;
}
见主机端代码第144行:
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, (size_t[]){ 64 }, (size_t[]){ 32 }, 0, NULL, NULL);
这里,我们设定全局工作项个数为64,每个工作组有32个线程,那么这样一来就自然地被划分为两个工作组。如果我们把32改为64,这么一来就变为一个工作组,这样,在一个SM中通过一个共享变量进行通信完全OK,程序就能正常终止。
另外,如果想保持原来的2个Work Group,那么必须通过全局变量进行通信:
__kernel void solve_sum(
__global volatile unsigned buffer[512],
__global unsigned dest[512]
)
{
__local volatile int flag = 0;
size_t gid = get_global_id(0);
if(0 <= gid && gid < 32)
{
while(buffer[256] != 1);
buffer[256] = 0;
buffer[gid] = 0x1UL;
//write_mem_fence(CLK_GLOBAL_MEM_FENCE);
}
else if(32 <= gid && gid < 64)
{
buffer[256] = 1;
while(buffer[256] != 0);
unsigned ret = buffer[31 + 32 - gid];
dest[gid - 32] = ret;
}
}
这里还要注意一点。用于通信的变量都必须加上volatile,否则,OpenCL内核编译器会把对全局变量的第二次访问全都优化为直接从寄存器取数据,从而外部对此变量的改变在当前线程内将无法看见。