Tags: CUDAExample
Example Description
This example uses the int2 struct from NVIDIA's vector_types.h header and pads the data array to avoid bank conflicts. Note, however, that on the GTX 980 generation a warp has 32 threads and the "half warp" notion no longer applies, so the array should be padded to a multiple of 32 rather than 16. The example was written for earlier hardware and its constants no longer fit current GPUs, but the idea is general. Also, at this small data size bank conflicts are not the decisive efficiency factor, though avoiding them is still good practice.
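To make the padding point concrete, the sketch below (not part of this example; the kernel name and the assumption that width is a multiple of 32 are mine) shows the classic shared-memory padding trick on current hardware with 32 banks and 32-thread warps. With a row stride of 32, every element of a tile column would fall into the same bank; padding the stride to 33 spreads a column across all 32 banks.

__global__ void transposeTile(float *odata, const float *idata, int width)
{
    // pad the row stride from 32 to 33 so that column accesses
    // hit 32 different banks instead of the same one
    __shared__ float tile[32][32 + 1];

    int x = blockIdx.x * 32 + threadIdx.x;
    int y = blockIdx.y * 32 + threadIdx.y;
    tile[threadIdx.y][threadIdx.x] = idata[y * width + x];

    __syncthreads();

    // write the transposed tile; the reads walk down a tile column
    x = blockIdx.y * 32 + threadIdx.x;
    y = blockIdx.x * 32 + threadIdx.y;
    odata[y * width + x] = tile[threadIdx.x][threadIdx.y];
}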
Character Test
A char variable can be initialized either with a character literal or with its ASCII code; the test program below demonstrates this.
#include <iostream>

int main()
{
    char a1 = 'H';
    char a2 = 72;  // 72 is the ASCII code of 'H'
    std::cout << "a1 = " << a1 << ", " << "a2 = " << a2 << std::endl;
    return 0;
}
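Both variables print as H: since 72 is the ASCII code of 'H', the two initializations are equivalent.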
Initializing "Hello World" from character codes
char str[] = { 82, 111, 118, 118, 121, 42, 97, 121, 124, 118, 110, 56, 10, 10, 10, 10 };
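Each value above is the ASCII code of the corresponding character of "Hello World." plus 10; the trailing 10s decode to '\0' padding that rounds the length up to a multiple of 4. A minimal host-side check (not part of the example):

#include <cstdio>

int main()
{
    char str[] = { 82, 111, 118, 118, 121, 42, 97, 121, 124, 118, 110, 56, 10, 10, 10, 10 };
    // subtracting the offset of 10 recovers the plain text
    for (char &c : str)
        c -= 10;
    printf("%s\n", str);  // prints: Hello World.
    return 0;
}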
Initialize an array of int2 structs from vector_types.h: x stores the character code, y stores the offset.
// Use int2 to show that CUDA vector types can be used in .cpp code
int2 i2[16];
for (int i = 0; i < len; i++)
{
    i2[i].x = str[i];  // shifted character code
    i2[i].y = 10;      // offset that kernel2 subtracts
}
The runTest function
This function drives the GPU kernel calls, the C reference computations, and the result comparison. The GPU side is implemented two ways (a char-based kernel and an int2-based kernel), the C reference is implemented two ways to match, and the results of each pair are compared at the end.
////////////////////////////////////////////////////////////////////////////////
//! Entry point for Cuda functionality on host side
//! @param argc command line argument count
//! @param argv command line arguments
//! @param data data to process on the device
//! @param len len of \a data
////////////////////////////////////////////////////////////////////////////////
extern "C" bool
runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len)
{
    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    const unsigned int num_threads = len / 4;
    assert(0 == (len % 4));
    const unsigned int mem_size = sizeof(char) * len;
    printf("sizeof(char) = %d\n", (int)sizeof(char));
    const unsigned int mem_size_int2 = sizeof(int2) * len;
    printf("sizeof(int2) = %d\n", (int)sizeof(int2));

    // allocate device memory
    char *d_data;
    checkCudaErrors(cudaMalloc((void **) &d_data, mem_size));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_data, data, mem_size,
                               cudaMemcpyHostToDevice));
    // allocate device memory for the int2 version
    int2 *d_data_int2;
    checkCudaErrors(cudaMalloc((void **) &d_data_int2, mem_size_int2));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2,
                               cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);
    dim3 threads2(len, 1, 1);  // more threads needed for the separate int2 version

    // execute the kernels
    kernel<<< grid, threads >>>((int *) d_data);
    kernel2<<< grid, threads2 >>>(d_data_int2);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // compute reference solutions
    char *reference = (char *) malloc(mem_size);
    computeGold(reference, data, len);
    int2 *reference2 = (int2 *) malloc(mem_size_int2);
    computeGold2(reference2, data_int2, len);

    // copy results from device to host
    checkCudaErrors(cudaMemcpy(data, d_data, mem_size,
                               cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2,
                               cudaMemcpyDeviceToHost));

    // check result
    bool success = true;
    for (unsigned int i = 0; i < len; i++)
    {
        if (reference[i] != data[i] ||
            reference2[i].x != data_int2[i].x ||
            reference2[i].y != data_int2[i].y)
        {
            success = false;
        }
    }

    // cleanup memory
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFree(d_data_int2));
    free(reference);
    free(reference2);

    return success;
}
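For context, here is a sketch of a host main that ties the pieces together. It follows the structure of the standard cppIntegration sample, but the exact scaffolding (headers, return codes) is an assumption:

#include <cstdio>
#include <cstdlib>
#include <vector_types.h>

extern "C" bool runTest(const int argc, const char **argv,
                        char *data, int2 *data_int2, unsigned int len);

int main(int argc, char **argv)
{
    // "Hello World." with every byte shifted up by 10, padded with 10s
    char str[] = { 82, 111, 118, 118, 121, 42, 97, 121, 124, 118, 110, 56, 10, 10, 10, 10 };
    const unsigned int len = sizeof(str);

    int2 i2[16];
    for (unsigned int i = 0; i < len; i++)
    {
        i2[i].x = str[i];
        i2[i].y = 10;
    }

    bool ok = runTest(argc, (const char **)argv, str, i2, len);

    printf("%s\n", str);  // char version, decoded in place on the GPU
    for (unsigned int i = 0; i < len; i++)
        str[i] = (char)i2[i].x;
    printf("%s\n", str);  // int2 version

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}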
GPU kernel implementations
///////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_odata memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel(int *g_data)
{
    // write data to global memory
    const unsigned int tid = threadIdx.x;
    int data = g_data[tid];

    // use integer arithmetic to process all four bytes with one thread
    // this serializes the execution, but is the simplest solution to avoid
    // bank conflicts for this very low number of threads
    // in general it is more efficient to process each byte by a separate thread,
    // to avoid bank conflicts the access pattern should be
    // g_data[4 * wtid + wid], where wtid is the thread id within the half warp
    // and wid is the warp id
    // see also the programming guide for a more in depth discussion.
    g_data[tid] = ((((data << 0) >> 24) - 10) << 24)
                | ((((data << 8) >> 24) - 10) << 16)
                | ((((data << 16) >> 24) - 10) << 8)
                | ((((data << 24) >> 24) - 10) << 0);
}
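As the comment notes, processing one byte per thread is generally more efficient. A minimal sketch of that idea for current hardware (the kernel name is mine, not part of the example); consecutive threads touching consecutive bytes within a warp coalesce into a single memory transaction:

__global__ void kernelPerByte(char *g_data)
{
    // one thread per byte; launch with len threads in total
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[tid] = g_data[tid] - 10;
}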
///////////////////////////////////////////////////////////////////////////////
//! Demonstration that int2 data can be used in the cpp code
//! @param g_odata memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void
kernel2(int2 *g_data)
{
    // write data to global memory
    const unsigned int tid = threadIdx.x;
    int2 data = g_data[tid];

    // each thread handles one int2 element: subtract the offset stored in y
    // from the character code stored in x; y itself is left unchanged
    g_data[tid].x = data.x - data.y;
}
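Because runTest launches kernel2 with len threads (threads2), each thread handles exactly one element; only x is modified, and the host-side comparison also verifies that y came back unchanged.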
C reference implementations
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
//! Each element has the constant offset 10 subtracted from it
//! @param reference reference data, computed but preallocated
//! @param idata input data as provided to device
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void
computeGold(char *reference, char *idata, const unsigned int len)
{
    for (unsigned int i = 0; i < len; ++i)
        reference[i] = idata[i] - 10;
}
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set for the int2 version
//! Each element's x has its own y offset subtracted; y is left unchanged
//! @param reference reference data, computed but preallocated
//! @param idata input data as provided to device
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void
computeGold2(int2 *reference, int2 *idata, const unsigned int len)
{
    for (unsigned int i = 0; i < len; ++i)
    {
        reference[i].x = idata[i].x - idata[i].y;
        reference[i].y = idata[i].y;
    }
}
Output
GPU Device 0: "GeForce GTX 980" with compute capability 5.2
sizeof(char) = 1
sizeof(int2) = 8
Hello World.
Hello World.
Press any key to continue . . .