使用 Numpy 和 Cuda kernel 完美复现Opencv的Sobel算子(为了高性能)

1.简单描述OpenCV的Sobel算子;

OpenCV的Sobel算子是一种常用的边缘检测算法,用于检测图像中的水平和垂直边缘。

Sobel算子通过卷积操作来计算图像中每个像素的梯度值。它使用两个3×3的卷积核,一个用于检测水平边缘,另一个用于检测垂直边缘。这两个卷积核分别为:

-1 0 1       -1 -2 -1
-2 0 2        0  0  0
-1 0 1        1  2  1

在进行卷积时,将卷积核应用于图像的每个像素,计算其与相邻像素之间的差异。这些差异将被用来计算梯度大小和方向,从而检测出图像中的边缘。

Sobel算子是一种简单但有效的边缘检测方法,常用于图像处理和计算机视觉领域中的特征提取和对象识别任务。

2. Numpy 复现代码

复现的时候需要注意的点:

  1. Sobel其实就是简单的2个卷积操作,但是,正常卷积操作为了保持原图大小不变,是应该padding的。opencv官方也是针对3*3卷积的Sobel算子的原始图像进行了padding,我们为了效果一致也进行padding,

2.1 numpy的padding操作如下:

在 NumPy 中,可以使用 numpy.pad() 函数来进行数组的 padding。它的语法如下(opencv官方的对原图采用的reflect进行的对称反射填充):

numpy.pad(array, pad_width, mode, **kwargs)

其中,array 是要进行 padding 的数组,pad_width 是一个整数或整数序列,指定每个轴向上要填充的宽度,mode 指定 padding 的方式,常见的取值有:

  • constant:用常数填充,需要指定填充值(constant_values)。
  • edge:用数组的边缘值填充。
  • reflect:用对称的方式填充,即从数组的边缘开始反射数组值。
  • wrap:用数组的另一端的值填充,即用数组的首位相连的方式填充。

2.2 Numpy完整复现代码

我们这里为了方便之后写Cuda C代码,手动实现了reflect操作

import cv2
import numpy as np

def reflect(array):
    ndim = array.shape[0]
    img_expand = np.zeros((ndim + 2, ndim + 2), dtype=np.uint8)
    ndims = img_expand.shape[0]
    start = 0
    end = ndims
    # 1. 复制
    img_expand[start + 1: end-1, start + 1: end-1] = array
                
    img_expand[0] = img_expand[2]
    img_expand[:, 0]   = img_expand[:, 2]
    img_expand[:, end - 1] = img_expand[:, end - 3]
    img_expand[end - 1] = img_expand[end - 3]
    
    
    return img_expand


def sobel(img):    
    # 定义 Sobel 算子    
    sobel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])    
    sobel_y = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])    
    # 图像矩阵的行数和列数    
    rows, cols = img.shape    
    # 定义输出图像矩阵    
    output_x = np.zeros((rows, cols))    
    output_y = np.zeros((rows, cols))    
    output = np.zeros((rows, cols))    
    # 对图像进行边缘扩展    
#     img_expand = np.pad(img, ((1, 1), (1, 1)), 'reflect') 
    img_expand = reflect(img)
    # 对每个像素点进行卷积计算    
    for i in range(rows):        
        for j in range(cols):            
            output_x[i, j] = np.sum(sobel_x * img_expand[i:i+3, j:j+3])            
            output_y[i, j] = np.sum(sobel_y * img_expand[i:i+3, j:j+3])    
            # 计算梯度幅值    
            # output = np.sqrt(output_x ** 2 + output_y ** 2)    
    return output_x, output_y



raw_img = cv2.imread("feijidongche2_4_64730_input_gray.bmp", cv2.IMREAD_COLOR)
gray_img = cv2.cvtColor(raw_img, cv2.COLOR_BGR2GRAY)

print(f"====================== verify: sobel x =================================")
sobel_x = cv2.Sobel(gray_img, -1, 1, 0, ksize=3)
sobel_self_wrtie_x, _ = sobel(gray_img)
sobel_self_wrtie_x = np.clip(sobel_self_wrtie_x, 0, 255)
print(sobel_x)
print("="*50)
print(sobel_self_wrtie_x)
isclose = np.isclose(sobel_self_wrtie_x, sobel_x, rtol=1e-9, atol=1e-9)
print("="*50)
print(sobel_self_wrtie_x[isclose==False])


print(f"====================== verify: sobel y =================================")
sobel_y = cv2.Sobel(gray_img, -1, 0, 1, ksize=3)
_, sobel_self_wrtie_y = sobel(gray_img)
sobel_self_wrtie_y = np.clip(sobel_self_wrtie_y, 0, 255)
print(sobel_y)
print("="*50)
print(sobel_self_wrtie_y)
isclose = np.isclose(sobel_self_wrtie_x, sobel_x, rtol=1e-9, atol=1e-9)
print("="*50)
print(sobel_self_wrtie_x[isclose==False])

代码执行结果:

====================== verify: sobel x =================================
[[ 0  0 12 ...  8  0  0]
 [ 0  0 12 ...  9  0  0]
 [ 0 24 21 ...  5  0  0]
 ...
 [ 0  0 52 ...  0  0  0]
 [ 0  0  0 ... 11  0  0]
 [ 0  0  0 ... 26  0  0]]
==================================================
[[ 0.  0. 12. ...  8.  0.  0.]
 [ 0.  0. 12. ...  9.  0.  0.]
 [ 0. 24. 21. ...  5.  0.  0.]
 ...
 [ 0.  0. 52. ...  0.  0.  0.]
 [ 0.  0.  0. ... 11.  0.  0.]
 [ 0.  0.  0. ... 26.  0.  0.]]
==================================================
[]
====================== verify: sobel y =================================
[[ 0  0  0 ...  0  0  0]
 [ 0  0  6 ...  0  0  0]
 [ 0  0  0 ...  0  3 12]
 ...
 [84 67 18 ...  0  0  0]
 [72 60 23 ... 15 49 68]
 [ 0  0  0 ...  0  0  0]]
==================================================
[[ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  6. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  3. 12.]
 ...
 [84. 67. 18. ...  0.  0.  0.]
 [72. 60. 23. ... 15. 49. 68.]
 [ 0.  0.  0. ...  0.  0.  0.]]
==================================================
[]  # 最后结果为空,证明我们复现的sobel和opencv的sobel算子效果一模一样

3.Cuda代码今日稍后追加;

3.1 kernel 代码:

#include "cuda_runtime.h"


typedef unsigned char uint8_t;



#define checkCudaKernel(...)                                                                         \
    __VA_ARGS__;                                                                                     \
    do{cudaError_t cudaStatus = cudaPeekAtLastError();                                               \
    if (cudaStatus != cudaSuccess){                                                                  \
        printf("launch failed: %s\n", cudaGetErrorString(cudaStatus));                                  \
    }} while(0);


static __device__ uint8_t cast(float value){
    return value < 0 ? 0 : (value > 255 ? 255 : value);
}


static __global__ void sobelKernel(const unsigned char* imgIn, uint8_t* imgOut, const int width, const int height, const int channels, const bool xDirection)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height)
        return;

    if (x < width && y < height)
    {
        int sumX = 0, sumY = 0;
        const int gx[3][3] = { {-1, 0, 1}, {-2, 0, 2}, {-1, 0, 1} };
        const int gy[3][3] = { {-1, -2, -1}, {0, 0, 0}, {1, 2, 1} };

        int xEdge, yEdge;
        for (int k = 0; k < channels; ++k)
        {
            for (int j = -1; j <= 1; ++j)
            {
                for (int i = -1; i <= 1; ++i)
                {    
                    xEdge = i;
                    yEdge = j;

                    if (y == 0 && j == -1)
                    {
                        yEdge = 1;
                    }

                    if (x == 0 && i == -1)
                    {
                        xEdge = 1;
                    }

                    if (y == height - 1 && j == 1)
                    {
                        yEdge = -1;
                    }

                    if (x == width - 1 && i == 1)
                    {
                        xEdge = -1;
                    }

                    int idx = (y + yEdge) * width * channels + (x + xEdge) * channels + k;
                    if (x >= 0 && x < width && y >= 0 && y < height)
                    {
                        if (xDirection)
                        {
                            sumX += gx[j + 1][i + 1] * imgIn[idx];
                        }
                        else
                        {
                            sumY += gy[j + 1][i + 1] * imgIn[idx];
                        }
                    }
                }
            }
        }

        int idx = y * width * channels + x * channels;
        if (xDirection)
        {
            imgOut[idx] = cast(sumX);
        }
        else
        {
            imgOut[idx] = cast(sumY);
        }
    }

    return;
}


/**
 * \brief: Sobel算子的gpu实现
 * \param: src 输入的灰度图像的gpu数据
 * \param: src_width 输入的灰度图像的宽度
 * \param: src_height 输入的灰度图像的高度
 * \param: dst 提取的边缘的信息, 需要传入一个与输入大小相同的gpu的空间的首地址
 * \param: xDirection 是否选择 x 反向的, 如果选择X方向, 则返回的dst也为X方向的边缘信息
 * \param: stream cuda任务流
 * 
 * \retval: None
 */ 
void Sobel(
    uint8_t* src, int src_width, int src_height, 
    uint8_t* dst, bool xDirection,
    cudaStream_t stream)
{   
    // 32 * 32 = 1024; Pascal 架构和 Turing 架构的显卡一般最多为1个block能启动1024个thread; 
    // 多了则会报错, 核函数会启动失败
    dim3 threadPerBlock(32, 32);    
    dim3 blockPerGrid((src_width + threadPerBlock.x - 1) / threadPerBlock.x, 
        (src_height + threadPerBlock.y - 1) / threadPerBlock.y);

    checkCudaKernel(sobelKernel << <blockPerGrid, threadPerBlock, 0, stream >> > (
        src, dst, src_width, src_height, 1, xDirection
    ));
}

3.2 C++调用 kernel 代码,并且验证效果代码:

#include <memory>
#include <iostream>
#include "opencv2/opencv.hpp"
#include <cassert>
#include "tools/ilogger.hpp"
#include "cuda_runtime.h"

#define Debug 0

#define checkCudaRuntime(call) check_runtime(call, #call, __LINE__, __FILE__)

bool check_runtime(cudaError_t e, const char* call, int line, const char *file){
    if (e != cudaSuccess) {
        printf("CUDA Runtime error %s # %s, code = %s [ %d ] in file %s:%d\n", call, cudaGetErrorString(e), cudaGetErrorName(e), e, file, line);
        return false;
    }
    return true;
}

/**
 * \brief: Sobel算子的gpu实现
 * \param: src 输入的灰度图像的gpu数据
 * \param: src_width 输入的灰度图像的宽度
 * \param: src_height 输入的灰度图像的高度
 * \param: dst 提取的边缘的信息, 需要传入一个与输入大小相同的gpu的空间的首地址
 * \param: xDirection 是否选择 x 反向的, 如果选择X方向, 则返回的dst也为X方向的边缘信息
 * \param: stream cuda任务流
 * 
 * \retval: None
 */ 
void Sobel(
    uint8_t* src, int src_width, int src_height, 
    uint8_t* dst, bool xDirection,
    cudaStream_t stream);

void write_sobel_result(cv::Mat mat, std::string file_name)
{
    std::ofstream ofs(file_name);
    ofs.setf(std::ios::fixed, std::ios::floatfield);  // 设定为 fixed 模式,以小数点表示浮点数
    ofs.precision(3);  // 设置精度 2
    for (int i = 0; i < mat.rows; i++)
    {
        for (int j = 0; j < mat.cols; j++)
        {
            ofs << (int)mat.at<u_char>(i, j) << " ";
        }

        ofs << std::endl;
    }

    ofs.close();
}

int main()
{   
    cv::Mat img = cv::imread("workspace/1.jpg", cv::IMREAD_COLOR);
    assert(!img.empty());

    cv::Mat grayMat;
    cv::cvtColor(img, grayMat, cv::COLOR_BGR2GRAY);
    write_sobel_result(grayMat, "workspace/grayMat_1.txt");

    // 1. Malloc -> Copy -> rgb
    uint8_t* pbyImgBufferDev = nullptr;  // 200 * 200 * 3
    checkCudaRuntime(cudaMalloc(&pbyImgBufferDev, img.rows * img.cols * 3 * sizeof(uint8_t)));
    checkCudaRuntime(cudaMemcpy(pbyImgBufferDev, img.data, img.rows * img.cols * 3 * sizeof(uint8_t), cudaMemcpyHostToDevice));

    // BGR2GRAY test  pass
    uint8_t* pbyImgBufferGrayDev = nullptr;
    uint8_t* pbyImgBufferGrayHost = nullptr;
    pbyImgBufferGrayHost = new uint8_t[img.rows * img.cols];  // 200*200
    checkCudaRuntime(cudaMalloc(&pbyImgBufferGrayDev, img.rows * img.cols * sizeof(uint8_t)));
    //                                               linestep     width     height
    rgb_to_gray(pbyImgBufferDev, img.step.p[0], img.cols, img.rows, pbyImgBufferGrayDev, nullptr);
    checkCudaRuntime(cudaDeviceSynchronize());
    checkCudaRuntime(cudaMemcpy(pbyImgBufferGrayHost, pbyImgBufferGrayDev, img.rows * img.cols * sizeof(uint8_t), cudaMemcpyDeviceToHost));

    cv::Mat img_gray(img.rows, img.cols, CV_8UC1, pbyImgBufferGrayHost);
    #if Debug
    write_sobel_result(img_gray, "workspace/grayMat_K1.txt");
    cv::imwrite("workspace/1.bmp", img_gray);
    #endif

    // 保存opencv 的sobel 算子结果
#if 0
    cv::Mat Sobel_x, Sobel_y;
    cv::Sobel(img_gray, Sobel_x, -1, 1, 0, 3);
    std::cout << Sobel_x.type() << std::endl;
    write_sobel_result(Sobel_x, "Sobel_x_1.txt");
    // cv::imwrite("workspace/1_sobel_x.bmp", Sobel_x);

    cv::Sobel(img_gray, Sobel_y, -1, 0, 1, 3);
    write_sobel_result(Sobel_y, "Sobel_y_1.txt");
    // cv::Mat src;
    exit(0);
#endif
    
    typedef uint8_t test_type;
    
    test_type* pbySobelX = nullptr;
    checkCudaRuntime(cudaMalloc(&pbySobelX, img.rows * img.cols * sizeof(test_type)));
    checkCudaRuntime(cudaMemset(pbySobelX, 0, img.rows * img.cols * sizeof(test_type)));

    test_type* pbySobelY = nullptr;
    checkCudaRuntime(cudaMalloc(&pbySobelY, img.rows * img.cols * sizeof(test_type)));
    checkCudaRuntime(cudaMemset(pbySobelY, 0, img.rows * img.cols * sizeof(test_type)));

    // 测试Sobel 算子的性能
    while (0)
    {   
        auto t0 = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < 45; ++i)
        {   
            Sobel(pbyImgBufferGrayDev, img.size[1], img.size[0], pbySobelX, true, nullptr);
            Sobel(pbyImgBufferGrayDev, img.size[1], img.size[0], pbySobelY, false, nullptr);
        }
        checkCudaRuntime(cudaDeviceSynchronize());
        auto t1 = std::chrono::high_resolution_clock::now();
        INFO("Sobel time: %f ms", std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() / 1000.0);
    }

    auto t0 = std::chrono::high_resolution_clock::now();
    Sobel(pbyImgBufferGrayDev, img.size[1], img.size[0], pbySobelX, true, nullptr);
    Sobel(pbyImgBufferGrayDev, img.size[1], img.size[0], pbySobelY, false, nullptr);
    auto t1 = std::chrono::high_resolution_clock::now();
    checkCudaRuntime(cudaDeviceSynchronize());
    INFO("Sobel time: %f ms", std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() / 1000.0);

    // 保存Nvidai Kernel 的sobel 算子结果
    cv::Mat SobeX_mat(img.rows, img.cols, CV_8U);
    cv::Mat SobeY_mat(img.rows, img.cols, CV_8U);
    checkCudaRuntime(cudaMemcpy(SobeX_mat.data, pbySobelX, img.rows * img.cols * sizeof(test_type), cudaMemcpyDeviceToHost));
    checkCudaRuntime(cudaMemcpy(SobeY_mat.data, pbySobelY, img.rows * img.cols * sizeof(test_type), cudaMemcpyDeviceToHost));
    write_sobel_result(SobeX_mat, "Sobel_dev_x.txt");
    write_sobel_result(SobeY_mat, "Sobel_dev_y.txt");
    checkCudaRuntime(cudaFree(pbyImgBufferDev));


    checkCudaRuntime(cudaFree(pbyImgBufferGrayDev));
    delete [] pbyImgBufferGrayHost;
    return 0;
}


PS:

如果你觉得我的代码有帮助到你,请关注一波,非常感谢。
使用本教程遇到问题无法解决的话,可以在私信联系我或者在本文章下方进行评论(一般24小时内回复),我会尽我所能帮助你解决问题。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值