使用CUDA STREAM处理一张图片

本文写了一个demo,利用cuda流概念,将处理图片的任务分为两个任务:第一个流(stream[0])处理上半张图片,第二个流(stream[1])处理下半张图片。下面先看代码:

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core.hpp>

#include <cuda.h>
#include <cuda_runtime.h>

using namespace cv;
using namespace std;

// Main kernel: converts packed 8-bit colour pixels to integer HSV.
//
// Parameters:
//   d_image_RGB  device buffer of input pixels, read as .x = B, .y = G, .z = R.
//   d_image_HSV  device buffer of output pixels, written as .x = H, .y = S, .z = V.
//   height       TWICE the number of rows to convert: only rows [0, height/2)
//                of the given buffers are touched (the caller hands each
//                launch one half of an image together with the full height).
//   width        number of pixels per row.
//
// Output ranges:
//   V = max{R, G, B}                          in [0, 255]
//   S = 255 * (V - min{R, G, B}) / V          in [0, 255], 0 when R = G = B
//       (scaled by 255 because the plain integer quotient (V - min) / V
//        can only ever be 0 or 1)
//   H in [0, 360), with delta = max - min:
//       0                                  if delta = 0
//       (60 * (G - B)) / delta             if max = R   (+360 if negative)
//       120 + (60 * (B - R)) / delta       if max = G
//       240 + (60 * (R - G)) / delta       if max = B
//
// Launch layout: any 2D grid of 2D blocks. Both loops are grid-stride loops,
// so the launch configuration does not have to match the image size.
// No shared memory is used.
__global__ void global_RGB_to_HSV(const uchar3* __restrict__ d_image_RGB, short3* __restrict__ d_image_HSV, int height, int width){
    const int rows = height / 2;
    const int rowStride = gridDim.y * blockDim.y;
    const int colStride = gridDim.x * blockDim.x;

    for (int row = blockDim.y * blockIdx.y + threadIdx.y; row < rows; row += rowStride) {
        for (int col = blockDim.x * blockIdx.x + threadIdx.x; col < width; col += colStride) {
            // 64-bit flat index: row * width can exceed INT_MAX for very large images.
            const size_t idx = (size_t)row * width + col;

            // Read the pixel once instead of once per channel.
            const uchar3 px = d_image_RGB[idx];
            const int B = px.x;
            const int G = px.y;
            const int R = px.z;

            const int V = max(R, max(G, B));
            const int m = min(R, min(G, B));
            const int delta = V - m;

            // delta > 0 implies V > 0, so the division is safe.
            const int S = (delta > 0) ? (255 * delta) / V : 0;

            int H;
            if (delta == 0) {
                H = 0;
            } else if (V == R) {
                H = 60 * (G - B) / delta;
            } else if (V == G) {
                H = 120 + 60 * (B - R) / delta;
            } else {
                H = 240 + 60 * (R - G) / delta;
            }
            // Only the max = R branch can go negative; wrap it into [0, 360).
            if (H < 0) {
                H += 360;
            }

            d_image_HSV[idx] = make_short3((short)H, (short)S, (short)V);
        }
    }
}

// Reports a failed CUDA runtime call on stderr and terminates the process.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Loads an image, converts it to HSV on the GPU using two CUDA streams
// (stream[0] handles the top rows, stream[1] the bottom rows), prints the
// elapsed GPU time, then saves and displays the result.
int main()
{
    cv::Mat image_RGB = cv::imread("../../learning_cudastream/front.jpg");
    if (image_RGB.empty()) {
        std::cerr << "Failed to read the input image" << std::endl;
        return 1;
    }
    int height = image_RGB.rows;
    int width = image_RGB.cols;
    cv::Mat image_HSV(height, width, CV_16SC3);

    // Split by whole rows so an odd height does not cut a row in half:
    // part 0 gets height/2 rows, part 1 gets the remainder.
    const int rows[2] = { height / 2, height - height / 2 };
    const size_t pixels[2] = { (size_t)rows[0] * width, (size_t)rows[1] * width };
    const size_t hostOffset[2] = { 0, pixels[0] };   // in pixels, from the start of the image

    // cv::Mat buffers are ordinary pageable memory, for which cudaMemcpyAsync
    // cannot overlap with other work. Pin them (best effort) so the two
    // streams can actually run copies and kernels concurrently.
    const bool rgbPinned = cudaHostRegister(image_RGB.data, (pixels[0] + pixels[1]) * sizeof(uchar3), cudaHostRegisterDefault) == cudaSuccess;
    const bool hsvPinned = cudaHostRegister(image_HSV.data, (pixels[0] + pixels[1]) * sizeof(short3), cudaHostRegisterDefault) == cudaSuccess;
    if (!rgbPinned || !hsvPinned) {
        (void)cudaGetLastError();   // clear the error so later checks do not report it
        std::cerr << "Warning: could not pin host memory; copies will not overlap" << std::endl;
    }

    uchar3* d_image_RGB[2] = { NULL, NULL };
    short3* d_image_HSV[2] = { NULL, NULL };

    // Create the two CUDA streams.
    cudaStream_t stream[2];
    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaStreamCreate(&stream[i]), "cudaStreamCreate");
    }

    cudaEvent_t e_start, e_stop;
    checkCuda(cudaEventCreate(&e_start), "cudaEventCreate");
    checkCuda(cudaEventCreate(&e_stop), "cudaEventCreate");
    checkCuda(cudaEventRecord(e_start, 0), "cudaEventRecord");

    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaMalloc(&d_image_RGB[i], pixels[i] * sizeof(uchar3)), "cudaMalloc RGB");
        checkCuda(cudaMalloc(&d_image_HSV[i], pixels[i] * sizeof(short3)), "cudaMalloc HSV");
    }

    // Work is issued breadth-first (all uploads, then all kernels, then all
    // downloads) so that the two streams can interleave.
    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaMemcpyAsync(d_image_RGB[i], (uchar3*)image_RGB.data + hostOffset[i], pixels[i] * sizeof(uchar3), cudaMemcpyHostToDevice, stream[i]), "cudaMemcpyAsync H2D");
    }

    dim3 blocksPerGrid(10, 10, 1);
    dim3 threadsPerBlock(32, 32, 1);

    for (int i = 0; i < 2; i = i + 1) {
        // The kernel converts height/2 rows of the buffer it is given, so pass
        // twice the number of rows that this stream owns.
        global_RGB_to_HSV <<<blocksPerGrid, threadsPerBlock, 0, stream[i]>>> (d_image_RGB[i], d_image_HSV[i], 2 * rows[i], width);
        checkCuda(cudaGetLastError(), "global_RGB_to_HSV launch");
    }

    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaMemcpyAsync((short3*)image_HSV.data + hostOffset[i], d_image_HSV[i], pixels[i] * sizeof(short3), cudaMemcpyDeviceToHost, stream[i]), "cudaMemcpyAsync D2H");
    }

    // Errors raised while the kernels or copies execute surface here.
    checkCuda(cudaStreamSynchronize(stream[0]), "cudaStreamSynchronize");
    checkCuda(cudaStreamSynchronize(stream[1]), "cudaStreamSynchronize");

    checkCuda(cudaEventRecord(e_stop, 0), "cudaEventRecord");
    checkCuda(cudaEventSynchronize(e_stop), "cudaEventSynchronize");
    float elapsedTime;
    checkCuda(cudaEventElapsedTime(&elapsedTime, e_start, e_stop), "cudaEventElapsedTime");
    printf("Time is %3.2f ms\n", elapsedTime);

    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaFree(d_image_RGB[i]), "cudaFree");
        checkCuda(cudaFree(d_image_HSV[i]), "cudaFree");
    }

    // Release the events, streams and host-memory registrations.
    checkCuda(cudaEventDestroy(e_start), "cudaEventDestroy");
    checkCuda(cudaEventDestroy(e_stop), "cudaEventDestroy");
    for (int i = 0; i < 2; i = i + 1) {
        checkCuda(cudaStreamDestroy(stream[i]), "cudaStreamDestroy");
    }
    if (rgbPinned) {
        checkCuda(cudaHostUnregister(image_RGB.data), "cudaHostUnregister");
    }
    if (hsvPinned) {
        checkCuda(cudaHostUnregister(image_HSV.data), "cudaHostUnregister");
    }

    // NOTE(review): image_HSV is 16-bit signed. JPEG stores 8-bit data and
    // imshow rescales 16-bit input for display, so neither output shows the
    // raw H values above 255 - confirm against the OpenCV documentation and
    // convert explicitly if exact values matter.
    cv::imwrite("../../learning_cudastream/front_HSV.jpg", image_HSV);
    cv::imshow("HSV", image_HSV);
    cv::waitKey(0);

    return 0;
}

简单说一下代码:代码实现的是将RGB图片转换为HSV图片

__global__ void global_RGB_to_HSV();//具体实现的内核函数
因为HSV中的H取值范围是0-360,超过255,所以用16位数来容纳。
cudaMemcpyAsync((short3*)((short3*)image_HSV.data+height*width/2), d_image_HSV_1, height*width*sizeof(short3)/2, cudaMemcpyDeviceToHost, stream[1]);//在传递的时候要注意指针需要强制类型转换成对应的类型

RGB原图:
RGB原图变为HSV图:
(上图为转换后的HSV图。)但实际上,代码运行之后通过imshow显示出来的是一张完全灰色的图片,可能是因为imshow不能正常显示16位图片;而把结果保存之后再打开,就是上面这样正常的结果。
处理这样一张1280x720的图片,用时基本在10ms以内。如果不用cuda stream,用时全部在10ms以上,平均约为11ms。使用cuda stream确实比不使用要快,但此处提升效果不是很明显,可能是主机与设备之间的来回拷贝占了大部分时间。

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值