本文写了一个demo,利用cuda流概念,将处理图片的任务分为两个任务:第一个流(stream[0])处理上半张图片,第二个流(stream[1])处理下半张图片。下面先看代码:
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core.hpp>
#include <cuda.h>
#include <cuda_runtime.h>
using namespace cv;
using namespace std;
// Main kernel: converts a block of 3-channel 8-bit pixels to HSV.
//
// Each input pixel is read as (x, y, z) = (B, G, R). Each output pixel is
// written as (x, y, z) = (H, S, V) with
//   V = max{R, G, B}                                   (0..255)
//   S = 255 * (V - min{R, G, B}) / V, or 0 if R = G = B (0..255)
//   H = 0                              if max = min
//       60 * (G - B) / delta           if max = R   (+360 when negative)
//       120 + 60 * (B - R) / delta     if max = G
//       240 + 60 * (R - G) / delta     if max = B
//   where delta = max - min, so H lies in 0..359 (integer degrees).
//
// Parameters:
//   d_image_RGB  device buffer holding at least (height / 2) * width pixels
//   d_image_HSV  device buffer receiving the same number of pixels
//   height       height of the FULL image; only the first height / 2 rows of
//                the buffers are processed (one launch handles half an image)
//   width        number of pixels per row
//
// Launch layout: any 2D grid/block shape works, because both loops are
// grid-stride loops over rows (y) and columns (x). No shared memory is used.
__global__ void global_RGB_to_HSV(uchar3* d_image_RGB, short3* d_image_HSV ,int height, int width){
    const int halfHeight = height / 2;
    const int rowStride = gridDim.y * blockDim.y;
    const int colStride = gridDim.x * blockDim.x;

    for (int row = blockDim.y * blockIdx.y + threadIdx.y; row < halfHeight; row += rowStride) {
        for (int col = blockDim.x * blockIdx.x + threadIdx.x; col < width; col += colStride) {
            const int idx = row * width + col;

            // Load the pixel once instead of issuing three separate global reads.
            const uchar3 pixel = d_image_RGB[idx];
            const int B = pixel.x;
            const int G = pixel.y;
            const int R = pixel.z;

            const int V = max(R, max(G, B));
            const int m = min(R, min(G, B));
            const int delta = V - m;

            // Saturation scaled to 0..255. The plain integer quotient
            // (V - m) / V would truncate to 0 (or 1 when m == 0) and lose
            // all saturation information.
            const int S = (delta > 0) ? (255 * delta) / V : 0;

            // Hue in degrees; the branch order decides ties between channels.
            int H;
            if (delta == 0) {
                H = 0;
            } else if (V == R) {
                H = 60 * (G - B) / delta;
            } else if (V == G) {
                H = 120 + 60 * (B - R) / delta;
            } else {
                H = 240 + 60 * (R - G) / delta;
            }
            // Wrap negative hue into the 0..359 range.
            if (H < 0) {
                H += 360;
            }

            d_image_HSV[idx] = make_short3((short)H, (short)S, (short)V);
        }
    }
}
// Reports a failed CUDA runtime call and terminates, so that no error code is
// silently ignored.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Loads an image, converts it to HSV on the GPU using two CUDA streams
// (stream[0] handles the top rows, stream[1] the bottom rows), prints the
// elapsed GPU time, then saves and displays the result.
int main()
{
    const char* inputPath = "../../learning_cudastream/front.jpg";
    cv::Mat image_RGB = cv::imread(inputPath);
    if (image_RGB.empty()) {
        std::fprintf(stderr, "Failed to read image %s\n", inputPath);
        return EXIT_FAILURE;
    }
    const int height = image_RGB.rows;
    const int width = image_RGB.cols;
    cv::Mat image_HSV(height, width, CV_16SC3);

    // Split the image by whole rows. With an odd height the second half gets
    // the extra row, so every pixel is covered and no copy ends mid-pixel.
    const int halfRows[2] = { height / 2, height - height / 2 };
    const size_t pixelOffset[2] = { 0, static_cast<size_t>(halfRows[0]) * width };
    size_t rgbBytes[2];
    size_t hsvBytes[2];
    for (int i = 0; i < 2; i = i + 1) {
        const size_t pixels = static_cast<size_t>(halfRows[i]) * width;
        rgbBytes[i] = pixels * sizeof(uchar3);
        hsvBytes[i] = pixels * sizeof(short3);
    }

    uchar3* h_image_RGB = reinterpret_cast<uchar3*>(image_RGB.data);
    short3* h_image_HSV = reinterpret_cast<short3*>(image_HSV.data);

    // Page-lock the host buffers so the async copies can really overlap with
    // kernel execution. This is best effort: if registration is unavailable
    // the copies still work on pageable memory, just without overlap.
    const size_t totalPixels = static_cast<size_t>(height) * width;
    const bool rgbPinned =
        cudaHostRegister(image_RGB.data, totalPixels * sizeof(uchar3), cudaHostRegisterDefault) == cudaSuccess;
    const bool hsvPinned =
        cudaHostRegister(image_HSV.data, totalPixels * sizeof(short3), cudaHostRegisterDefault) == cudaSuccess;
    cudaGetLastError(); // clear the error state left by a failed registration

    // Create the two CUDA streams.
    cudaStream_t stream[2];
    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaStreamCreate(&stream[i]), "cudaStreamCreate");
    }

    cudaEvent_t e_start, e_stop;
    checkCuda(cudaEventCreate(&e_start), "cudaEventCreate");
    checkCuda(cudaEventCreate(&e_stop), "cudaEventCreate");
    checkCuda(cudaEventRecord(e_start, 0), "cudaEventRecord");

    uchar3* d_image_RGB[2] = { 0, 0 };
    short3* d_image_HSV[2] = { 0, 0 };
    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaMalloc(&d_image_RGB[i], rgbBytes[i]), "cudaMalloc");
        checkCuda(cudaMalloc(&d_image_HSV[i], hsvBytes[i]), "cudaMalloc");
    }

    // Work is issued stage by stage across both streams (copy in, kernel,
    // copy out), the same order as a hand-written two-stream sequence.
    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaMemcpyAsync(d_image_RGB[i], h_image_RGB + pixelOffset[i], rgbBytes[i],
                                  cudaMemcpyHostToDevice, stream[i]),
                  "cudaMemcpyAsync (host to device)");
    }

    dim3 blocksPerGrid(10, 10, 1);
    dim3 threadsPerBlock(32, 32, 1);
    for (int i = 0; i < 2; i = i + 1) {
        // The kernel processes (height argument) / 2 rows, so pass twice the
        // number of rows this stream owns.
        global_RGB_to_HSV <<<blocksPerGrid, threadsPerBlock, 0, stream[i]>>> (
            d_image_RGB[i], d_image_HSV[i], 2 * halfRows[i], width);
        checkCuda(cudaGetLastError(), "kernel launch");
    }

    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaMemcpyAsync(h_image_HSV + pixelOffset[i], d_image_HSV[i], hsvBytes[i],
                                  cudaMemcpyDeviceToHost, stream[i]),
                  "cudaMemcpyAsync (device to host)");
    }

    checkCuda(cudaStreamSynchronize(stream[0]), "cudaStreamSynchronize");
    checkCuda(cudaStreamSynchronize(stream[1]), "cudaStreamSynchronize");

    checkCuda(cudaEventRecord(e_stop, 0), "cudaEventRecord");
    checkCuda(cudaEventSynchronize(e_stop), "cudaEventSynchronize");
    float elapsedTime;
    checkCuda(cudaEventElapsedTime(&elapsedTime, e_start, e_stop), "cudaEventElapsedTime");
    printf("Time is %3.2f ms\n", elapsedTime);

    // Release every GPU-side resource that was acquired above.
    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaFree(d_image_RGB[i]), "cudaFree");
        checkCuda(cudaFree(d_image_HSV[i]), "cudaFree");
    }
    if (rgbPinned) {
        checkCuda(cudaHostUnregister(image_RGB.data), "cudaHostUnregister");
    }
    if (hsvPinned) {
        checkCuda(cudaHostUnregister(image_HSV.data), "cudaHostUnregister");
    }
    checkCuda(cudaEventDestroy(e_start), "cudaEventDestroy");
    checkCuda(cudaEventDestroy(e_stop), "cudaEventDestroy");
    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaStreamDestroy(stream[i]), "cudaStreamDestroy");
    }

    cv::imwrite("../../learning_cudastream/front_HSV.jpg", image_HSV);

    // imshow does not render 16-bit signed data as-is (the window comes out
    // uniformly gray), so display a saturated 8-bit copy instead.
    cv::Mat image_HSV_8U;
    image_HSV.convertTo(image_HSV_8U, CV_8UC3);
    cv::imshow("HSV", image_HSV_8U);
    cv::waitKey(0);
    return 0;
}
简单说一下代码:代码实现的是将RGB图片转换为HSV图片
__global__ void global_RGB_to_HSV();//具体实现的内核函数
因为HSV中的H取值范围是0-360,超过255,所以用16位数来容纳。
cudaMemcpyAsync((short3*)((short3*)image_HSV.data+height*width/2), d_image_HSV_1, height*width*sizeof(short3)/2, cudaMemcpyDeviceToHost, stream[1]);//在传递的时候要注意指针需要强制类型转换成对应的类型
RGB原图:
变为HSV图:
但实际运行代码后,通过imshow显示的是一张几乎全灰的图片。原因是imshow不会按原值显示16位有符号(CV_16S)图像,而是先把像素值缩放、偏移到8位范围,0~360这样的小数值全部落在中间灰度附近;而保存为jpg时数据会被转换成8位,所以保存之后再打开就是上面这样正常的结果。
处理这样一张1280x720的图片,用时基本在10ms以内。如果不用cuda stream,用时始终在10ms以上,平均约为11ms。使用cuda stream确实比不使用要快,但此处提升并不明显,可能是主机与设备之间的来回拷贝占了大部分时间;另外cv::Mat的数据位于可分页(非锁页)内存中,cudaMemcpyAsync在这种内存上无法真正与核函数重叠执行,两个流之间的并行度因此有限。