更多文章参考:自己动手实现darknet预测分类动态库
1.get_pixel()代码
/*
 * Read one sample from image `m` at column x, row y, channel c.
 * Samples are addressed as c*h*w + y*w + x.
 * Only the upper bounds are asserted; negative coordinates are not checked.
 */
static float get_pixel(image m, int x, int y, int c)
{
    int offset;
    assert(x < m.w && y < m.h && c < m.c);
    offset = c*m.h*m.w + y*m.w + x;
    return m.data[offset];
}
2.set_pixel()代码
/*
 * Store `val` into image `m` at column x, row y, channel c.
 * Coordinates outside the image (including negative ones) are silently
 * ignored, so callers may write past the border without checking first.
 */
static void set_pixel(image m, int x, int y, int c, float val)
{
    int inside = (x >= 0 && y >= 0 && c >= 0 &&
                  x < m.w && y < m.h && c < m.c);
    if (!inside) return;
    /* Cannot fire after the guard above; retained from the original code. */
    assert(x < m.w && y < m.h && c < m.c);
    m.data[c*m.h*m.w + y*m.w + x] = val;
}
3.add_pixel()代码
/*
 * Accumulate `val` onto the sample of image `m` at column x, row y, channel c.
 * Only the upper bounds are asserted; unlike set_pixel there is no silent
 * out-of-range guard here.
 */
static void add_pixel(image m, int x, int y, int c, float val)
{
    float *cell;
    assert(x < m.w && y < m.h && c < m.c);
    cell = m.data + (c*m.h*m.w + y*m.w + x);
    *cell += val;
}
4.resize_image()代码
/*
 * Resize `im` to w x h with linear interpolation, done as two 1-D passes:
 *   1. horizontal: im (im.w x im.h) -> part (w x im.h)
 *   2. vertical:   part (w x im.h)  -> resized (w x h)
 *
 * Returns a newly allocated image (via make_image); the intermediate
 * buffer is released before returning. The input image is not modified.
 *
 * Edge handling:
 *   - The last output column/row is taken directly from the last source
 *     column/row instead of being interpolated.
 *   - A 1-pixel target (w == 1 or h == 1) no longer divides by zero.
 */
image resize_image(image im, int w, int h)
{
    image resized = make_image(w, h, im.c);
    image part = make_image(w, im.h, im.c);
    int r, c, k;
    /* (w - 1) or (h - 1) is zero for a 1-pixel target; the scale is unused
     * on that path, so any finite value works. Using 0 avoids inf/NaN. */
    float w_scale = (w > 1) ? (float)(im.w - 1) / (w - 1) : 0.f;
    float h_scale = (h > 1) ? (float)(im.h - 1) / (h - 1) : 0.f;

    /* Pass 1: interpolate along x for every source row. */
    for(k = 0; k < im.c; ++k){
        for(r = 0; r < im.h; ++r){
            for(c = 0; c < w; ++c){
                float val = 0;
                if(c == w-1 || im.w == 1){
                    /* Last column (or 1-px-wide source): no right neighbour. */
                    val = get_pixel(im, im.w-1, r, k);
                } else {
                    float sx = c*w_scale;
                    int ix = (int) sx;
                    float dx = sx - ix;
                    val = (1 - dx) * get_pixel(im, ix, r, k) + dx * get_pixel(im, ix+1, r, k);
                }
                set_pixel(part, c, r, k, val);
            }
        }
    }

    /* Pass 2: interpolate along y for every output row. */
    for(k = 0; k < im.c; ++k){
        for(r = 0; r < h; ++r){
            int last = (r == h-1 || im.h == 1);
            float sy = r*h_scale;
            int iy = (int) sy;
            float dy = sy - iy;
            if(last){
                /* Pin the last row to the last source row. Relying on
                 * (h-1)*h_scale landing exactly on im.h-1 is fragile: if the
                 * float product rounds just below it, iy drops by one, dy is
                 * ~1 and the row would be scaled towards zero because the
                 * second tap is skipped below. */
                iy = im.h - 1;
                dy = 0;
            }
            for(c = 0; c < w; ++c){
                float val = (1-dy) * get_pixel(part, c, iy, k);
                set_pixel(resized, c, r, k, val);
            }
            if(last) continue;
            for(c = 0; c < w; ++c){
                float val = dy * get_pixel(part, c, iy+1, k);
                add_pixel(resized, c, r, k, val);
            }
        }
    }

    free_image(part);
    return resized;
}
我们分两部分进行GPU加速:resize_image()中的每一组for循环各实现为一个核函数。
第一组for循环(宽度方向缩放)对应的核函数如下:
// Horizontal resize pass: one thread per output sample.
//   dst     : output buffer addressed as cfgw x realh x c
//   src     : input buffer addressed as realw x realh x c
//   w_scale : source-x step per output column, supplied by the caller
// Thread (x, y, z) maps to (output column, row, channel); threads that fall
// outside cfgw x realh x c do nothing.
__global__ void MatZoomWid(float* dst,float* src,float w_scale,int realw,int realh,int c,int cfgw)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int ch  = blockIdx.z * blockDim.z + threadIdx.z;
    if (col >= cfgw || row >= realh || ch >= c) return;

    float out = 0;
    const bool edge = (col == cfgw - 1) || (realw == 1);
    if (edge)
    {
        // Last output column (or 1-px-wide source): copy the last source column.
        out = getPixel(src, realw - 1, row, ch, realw, realh, c);
    }
    else
    {
        const float sx = col * w_scale;
        const int ix = (int) sx;
        const float dx = sx - ix;
        const float left  = getPixel(src, ix,     row, ch, realw, realh, c);
        const float right = getPixel(src, ix + 1, row, ch, realw, realh, c);
        out = (1 - dx) * left + dx * right;
    }
    set_pixel(dst, col, row, ch, out, cfgw, realh, c);
}
第二组for循环实现加速
// Vertical resize pass: one thread per output sample.
//   dst     : output buffer addressed as cfgw x cfgh x c
//   src     : input buffer addressed as cfgw x realh x c
//   h_scale : source-y step per output row, supplied by the caller
// Thread (x, y, z) maps to (output column, row, channel); threads that fall
// outside cfgw x cfgh x c do nothing.
//
// The last output row (and every row when the source is 1 px tall) is copied
// from the last source row without touching h_scale. This keeps the result
// finite when cfgh == 1 (a caller computing (realh-1)/(cfgh-1) passes inf/NaN
// then) and avoids a near-zero last row when j*h_scale rounds just below
// realh-1.
__global__ void MatZoomHeight(float* dst,float* src,float h_scale,int realh,int cfgh,int c,int cfgw)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;
    const int k = blockIdx.z * blockDim.z + threadIdx.z;
    if (i >= cfgw || j >= cfgh || k >= c) return;

    const bool lastRow = (j == cfgh - 1) || (realh == 1);

    int iy = realh - 1;
    float dy = 0.0f;
    if (!lastRow)
    {
        const float sy = j * h_scale;
        iy = (int) sy;
        dy = sy - iy;
    }

    // Blend both taps in a register and store once, instead of a store
    // followed by a read-modify-write of the same global location.
    float val = (1.0f - dy) * getPixel(src, i, iy, k, cfgw, realh, c);
    if (!lastRow)
    {
        val += dy * getPixel(src, i, iy + 1, k, cfgw, realh, c);
    }
    set_pixel(dst, i, j, k, val, cfgw, cfgh, c);
}
其中,add_pixel,getPixel(),set_pixel代码如下:
// Device-side read of one sample from a raw float buffer addressed as
// c*h*w + y*w + x, where w/h/realc are the buffer's width, height and
// channel count. Only the upper bounds are asserted.
__device__ float getPixel(float* data, int x, int y, int c,int w,int h,int realc)
{
    assert(x < w && y < h && c < realc);
    const int offset = c*h*w + y*w + x;
    return data[offset];
}
// Device-side store of `val` into a raw float buffer addressed as
// c*h*w + y*w + x. Coordinates outside w x h x realc (including negative
// ones) are silently ignored.
__device__ void set_pixel(float* data, int x, int y, int c, float val,int w,int h,int realc)
{
    const bool inside = (x >= 0 && y >= 0 && c >= 0 &&
                         x < w && y < h && c < realc);
    if (!inside) return;
    // Cannot fire after the guard above; retained from the original code.
    assert(x < w && y < h && c < realc);
    data[c*h*w + y*w + x] = val;
}
// Device-side accumulate of `val` onto one sample of a raw float buffer
// addressed as c*h*w + y*w + x. Only the upper bounds are asserted, and the
// update is a plain (non-atomic) read-modify-write.
__device__ void add_pixel(float* data, int x, int y, int c, float val,int w,int h,int realc)
{
    assert(x < w && y < h && c <realc);
    float* cell = data + (c*h*w + y*w + x);
    *cell += val;
}
实现好的GPU加速函数需要在convert_to_image_gpu中调用,调用如下:
// Intermediate buffer after the horizontal pass: cfgw x realh x c floats.
float* part;
cudaMalloc((void**)&part,cfgw*realh*c*sizeof(float));
dim3 gridSizepart((cfgw + blockSize.x - 1) / blockSize.x,(realh + blockSize.y - 1) / blockSize.y,(c + blockSize.z - 1) / blockSize.z);
// cfgw - 1 is zero for a 1-pixel-wide target; use 0 instead of dividing by it.
float w_scale = (cfgw > 1) ? (float)(realw - 1) / (cfgw - 1) : 0.0f;
MatZoomWid<< < gridSizepart,blockSize >>>(part,dst,w_scale,realw,realh,c,cfgw);
// Final buffer after the vertical pass: cfgw x cfgh x c floats.
float* resized;
cudaMalloc((void**)&resized,cfgw*cfgh*c*sizeof(float));
dim3 gridSizeRsized((cfgw + blockSize.x - 1) / blockSize.x,(cfgh + blockSize.y - 1) / blockSize.y,(c + blockSize.z - 1) / blockSize.z);
// cfgh - 1 is zero for a 1-pixel-tall target; use 0 so the kernel never sees inf/NaN.
float h_scale = (cfgh > 1) ? (float)(realh - 1) / (cfgh - 1) : 0.0f;
MatZoomHeight<< < gridSizeRsized,blockSize >>>(resized,part,h_scale,realh,cfgh,c,cfgw);
convert_to_image_gpu中添加resize_image加速函数之后,代码变为:
// Upload an 8-bit host image, convert it to float on the GPU and resize it
// to cfgw x cfgh, returning the result as a host-side `image`.
//
//   data         : host buffer of realw*realh*c bytes
//   realw, realh : source width / height
//   c            : channel count
//   cfgw, cfgh   : target width / height
//
// Pipeline: H2D copy -> MatConvertImg -> MatZoomWid -> MatZoomHeight -> pull.
// Every step is queued on the stream returned by get_cuda_stream() so the
// kernels are ordered after the asynchronous upload, and the stream is
// synchronised before the result is read back and the buffers are freed.
// All CUDA calls and kernel launches are checked with CHECK_CUDA.
extern "C" image convert_to_image_gpu(unsigned char* data,int realw,int realh,int c,int cfgw,int cfgh)
{
    cudaStream_t stream = get_cuda_stream();
    cudaError_t status;

    // Element counts are computed in size_t so large images do not overflow int.
    const size_t srcCount     = (size_t)realw * realh * c;
    const size_t partCount    = (size_t)cfgw * realh * c;
    const size_t resizedCount = (size_t)cfgw * cfgh * c;
    const size_t nBytes       = srcCount * sizeof(unsigned char);

    unsigned char* src;
    status = cudaMalloc((void**)&src, nBytes);
    CHECK_CUDA(status);
    float *dst;
    status = cudaMalloc((void**)&dst, srcCount * sizeof(float));
    CHECK_CUDA(status);

    // Copy the host data to the device.
    status = cudaMemcpyAsync(src, data, nBytes, cudaMemcpyHostToDevice, stream);
    CHECK_CUDA(status);

    // Kernel launch configuration: 32x32 threads per block over (x, y), one channel per z.
    dim3 blockSize(32,32,1);
    dim3 gridSize((realw + blockSize.x - 1) / blockSize.x,(realh + blockSize.y - 1) / blockSize.y,(c + blockSize.z - 1) / blockSize.z);
    MatConvertImg << < gridSize, blockSize, 0, stream >>>(dst,src,realw,realh,c);
    status = cudaGetLastError();
    CHECK_CUDA(status);

    // Horizontal pass: realw x realh -> cfgw x realh.
    float* part;
    status = cudaMalloc((void**)&part, partCount * sizeof(float));
    CHECK_CUDA(status);
    dim3 gridSizepart((cfgw + blockSize.x - 1) / blockSize.x,(realh + blockSize.y - 1) / blockSize.y,(c + blockSize.z - 1) / blockSize.z);
    // cfgw - 1 is zero for a 1-pixel-wide target; avoid dividing by it.
    float w_scale = (cfgw > 1) ? (float)(realw - 1) / (cfgw - 1) : 0.0f;
    MatZoomWid<< < gridSizepart, blockSize, 0, stream >>>(part,dst,w_scale,realw,realh,c,cfgw);
    status = cudaGetLastError();
    CHECK_CUDA(status);

    // Vertical pass: cfgw x realh -> cfgw x cfgh.
    float* resized;
    status = cudaMalloc((void**)&resized, resizedCount * sizeof(float));
    CHECK_CUDA(status);
    dim3 gridSizeRsized((cfgw + blockSize.x - 1) / blockSize.x,(cfgh + blockSize.y - 1) / blockSize.y,(c + blockSize.z - 1) / blockSize.z);
    // cfgh - 1 is zero for a 1-pixel-tall target; avoid passing inf/NaN to the kernel.
    float h_scale = (cfgh > 1) ? (float)(realh - 1) / (cfgh - 1) : 0.0f;
    MatZoomHeight<< < gridSizeRsized, blockSize, 0, stream >>>(resized,part,h_scale,realh,cfgh,c,cfgw);
    status = cudaGetLastError();
    CHECK_CUDA(status);

    // Make sure the upload and all three kernels have finished (and surface
    // any asynchronous execution error) before reading the result back.
    status = cudaStreamSynchronize(stream);
    CHECK_CUDA(status);

    image im = make_image(cfgw, cfgh, c);
    cuda_pull_array(resized,im.data,cfgw*cfgh*c);

    status = cudaFree(src);
    CHECK_CUDA(status);
    status = cudaFree(dst);
    CHECK_CUDA(status);
    status = cudaFree(part);
    CHECK_CUDA(status);
    status = cudaFree(resized);
    CHECK_CUDA(status);
    return im;
}