本文主要实现在C++程序中引用CUDA程序,主要意义是使顺序定义的数据能在CUDA程序中并行执行,然后返回结果。
程序主要包括以下三个文件:
main.cpp:定义main函数以及需要处理的数据;
Integration.cu:CUDA初始化、显存分配、核函数调用、显存释放;
Integration_kernel.cuh:核函数的定义。
程序输入为:Rovvy*ay|vn8(即"Hello World."中每个字符的ASCII码加10)
运行结果为:Hello World.(核函数将每个字符的ASCII码减10后得到)
1. main.cpp
#include <stdlib.h>
#include <cutil_inline.h>
extern "C" bool RunTest(char* data, int2* data_int2, unsigned int len);
/**
 * Demo entry point: builds an encoded string, hands it to RunTest twice
 * (once as a packed char buffer, once as an int2 array), and prints both
 * decoded results.
 *
 * Every character in `str` is the ASCII code of the intended output plus 10;
 * RunTest is expected to subtract 10 from each element in place.
 *
 * @return 0 on success, EXIT_FAILURE if RunTest reports failure.
 */
int main()
{
    // Encoded payload followed by a terminating NUL so it can be printed with %s.
    char str[] = { 82, 111, 118, 118, 121, 42, 97, 121, 124, 118, 110, 56, 10, 10, 10, 10, '\0'};

    // Number of payload characters; derived from the array so the two can
    // never drift apart (the trailing NUL is not processed).
    const int len = static_cast<int>(sizeof(str)) - 1;

    // Second representation of the same data: x holds the character code,
    // y holds the amount to subtract from it.
    int2 i2[len];
    for (int i = 0; i < len; i++)
    {
        i2[i].x = str[i];
        i2[i].y = 10;
    }

    // Do not print the buffers if the device-side processing did not run.
    const bool bTestResult = RunTest(str, i2, len);
    if (!bTestResult)
    {
        fprintf(stderr, "RunTest failed.\n");
        return EXIT_FAILURE;
    }

    // Result of the packed-char path.
    printf("%s\n", str);

    // Result of the int2 path, printed one character at a time.
    for (int i = 0; i < len; i++)
    {
        printf("%c", i2[i].x);
    }
    printf("\n");
    return 0;
}
2. Integration.cu
#include <cutil_inline.h>
#include "Integration_kernel.cuh"
/**
 * Selects the first usable CUDA device and makes it the current device.
 *
 * Devices are enumerated in index order; the first one whose properties can
 * be queried and whose reported compute-capability major version is >= 1 is
 * chosen. Failures are reported on stderr, success on stdout.
 *
 * @return true if a device was selected with cudaSetDevice, false if the
 *         device count could not be read, no suitable device exists, or
 *         cudaSetDevice failed.
 */
bool InitCUDA(void)
{
    int count = 0;

    // A failing enumeration must not be mistaken for "zero devices".
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return false;
    }
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // Find the first device that answers a property query and reports
    // compute capability 1.x or newer; i == count afterwards means none did.
    int i = 0;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    // Only claim success when the device was actually made current.
    err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
        return false;
    }

    printf("CUDA initialized.\n");
    return true;
}
/**
 * Runs both demo kernels on the supplied host buffers and copies the results
 * back in place.
 *
 * - `data` is processed by mykernel, which treats the buffer as packed
 *   32-bit words (four chars per thread).
 * - `data_int2` is processed by mykernel2, one element per thread.
 *
 * Both kernels index only by threadIdx.x, so the work is issued as a series
 * of single-block launches over consecutive slices of the device buffers.
 * This keeps every launch within the per-block thread limit for any `len`.
 *
 * NOTE(review): cutilSafeCall / cutilCheckMsg come from cutil_inline.h; they
 * presumably report the error and terminate the process on failure — confirm
 * against the SDK version in use.
 *
 * @param data       Host buffer of `len` chars, updated in place.
 * @param data_int2  Host buffer of `len` int2 elements, updated in place.
 * @param len        Number of elements in each buffer; need not be a
 *                   multiple of 4.
 * @return true on success (including len == 0, where there is nothing to
 *         do); false if a buffer pointer is null or CUDA could not be
 *         initialized.
 */
extern "C" bool RunTest(char* data, int2* data_int2, unsigned int len)
{
    if (!data || !data_int2)
    {
        fprintf(stderr, "RunTest: null input buffer.\n");
        return false;
    }
    if (len == 0)
    {
        // Nothing to process; a zero-thread launch would be an invalid configuration.
        return true;
    }
    if (!InitCUDA())
    {
        return false;
    }

    // Threads per single-block launch; chosen to be well inside the
    // per-block thread limit of any CUDA device.
    const unsigned int kMaxThreadsPerLaunch = 256;

    // mykernel consumes whole 32-bit words, so round the char buffer up to a
    // word boundary on the device. Sizes are computed in size_t to avoid
    // 32-bit overflow for large len.
    const size_t wordCount = (static_cast<size_t>(len) + 3) / 4;
    const size_t memsize = sizeof(char) * static_cast<size_t>(len);
    const size_t memsizePadded = wordCount * sizeof(int);
    const size_t memsize_int2 = sizeof(int2) * static_cast<size_t>(len);

    char* d_data = 0;
    cutilSafeCall(cudaMalloc((void**)&d_data, memsizePadded));
    // Zero the whole allocation first so the padding bytes of the last word
    // hold a defined value before the kernel reads them.
    cutilSafeCall(cudaMemset(d_data, 0, memsizePadded));
    cutilSafeCall(cudaMemcpy(d_data, data, memsize, cudaMemcpyHostToDevice));

    int2* d_data_int2 = 0;
    cutilSafeCall(cudaMalloc((void**)&d_data_int2, memsize_int2));
    cutilSafeCall(cudaMemcpy(d_data_int2, data_int2, memsize_int2, cudaMemcpyHostToDevice));

    const dim3 grid(1, 1, 1);

    // Packed-char path: one thread per 32-bit word, sliced into launches of
    // at most kMaxThreadsPerLaunch threads.
    for (size_t offset = 0; offset < wordCount; offset += kMaxThreadsPerLaunch)
    {
        const size_t remaining = wordCount - offset;
        const unsigned int count = remaining < kMaxThreadsPerLaunch
                                       ? static_cast<unsigned int>(remaining)
                                       : kMaxThreadsPerLaunch;
        const dim3 threads(count, 1, 1);
        mykernel<<<grid, threads>>>((int*)d_data + offset);
    }

    // int2 path: one thread per element, sliced the same way.
    for (size_t offset = 0; offset < len; offset += kMaxThreadsPerLaunch)
    {
        const size_t remaining = static_cast<size_t>(len) - offset;
        const unsigned int count = remaining < kMaxThreadsPerLaunch
                                       ? static_cast<unsigned int>(remaining)
                                       : kMaxThreadsPerLaunch;
        const dim3 threads2(count, 1, 1);
        mykernel2<<<grid, threads2>>>(d_data_int2 + offset);
    }

    // Check for launch errors before anything else touches the error state.
    cutilCheckMsg("Kernel execition failed!");

    // Blocking copies: they also wait for the kernels above to finish.
    // Only the caller's len chars are copied back, never the padding.
    cutilSafeCall(cudaMemcpy(data, d_data, memsize, cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaMemcpy(data_int2, d_data_int2, memsize_int2, cudaMemcpyDeviceToHost));

    cutilSafeCall(cudaFree(d_data));
    cutilSafeCall(cudaFree(d_data_int2));
    return true;
}
3. Integration_kernel.cuh
/**
 * Subtracts 10 from each of the four bytes packed in a 32-bit word.
 *
 * Each byte is handled independently with 8-bit wrap-around: a byte smaller
 * than 10 wraps to 246..255 and never borrows from, or spills sign bits
 * into, its neighbours. All arithmetic is unsigned, so no shift operates on
 * a negative value.
 *
 * Callable from both host and device so the arithmetic can be checked
 * without launching a kernel.
 *
 * @param packed Four bytes packed into one 32-bit word.
 * @return The word with every byte replaced by (byte - 10) mod 256.
 */
static __host__ __device__ inline unsigned int subtractTenPerByte(unsigned int packed)
{
    unsigned int result = 0;
    for (int shift = 0; shift < 32; shift += 8)
    {
        const unsigned int byte = (packed >> shift) & 0xFFu;
        // Mask after subtracting so a wrapped value stays inside its own byte lane.
        result |= ((byte - 10u) & 0xFFu) << shift;
    }
    return result;
}

/**
 * Subtracts 10 from every byte of a buffer viewed as packed 32-bit words.
 *
 * Launch layout: each thread handles the word at index threadIdx.x; the
 * block index is not used, so one launch covers blockDim.x words starting
 * at g_data. There is no bounds check: the caller must launch no more
 * threads than there are words in the buffer.
 *
 * @param g_data Device pointer to the words to update in place.
 */
__global__ void mykernel(int* g_data)
{
    const unsigned int tid = threadIdx.x;
    g_data[tid] = static_cast<int>(subtractTenPerByte(static_cast<unsigned int>(g_data[tid])));
}
/**
 * For each element, replaces x with x - y; y is left untouched.
 *
 * Launch layout: each thread handles the element at index threadIdx.x; the
 * block index is not used. There is no bounds check, so the caller must not
 * launch more threads than there are elements.
 *
 * @param g_data Device pointer to the int2 elements to update in place.
 */
__global__ void mykernel2(int2* g_data)
{
    // Address of the element owned by this thread.
    int2* const slot = g_data + threadIdx.x;

    // Read the whole pair once, then write back only the x component.
    const int2 pair = *slot;
    slot->x = pair.x - pair.y;
}