// Sobel edge (contour) extraction implemented on the GPU with CUDA.
#include <iostream>
#include <cmath>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <cuda.h>
#include <device_functions.h>
#include <device_launch_parameters.h>
#include <opencv.hpp>
/**
 * Sobel edge-magnitude kernel; each thread handles one pixel of a row-major
 * image stored with one unsigned char per pixel.
 *
 * Launch layout: a 2D grid of 2D blocks in which the thread's global x index
 * is the column and the global y index is the row. Threads that fall on the
 * image border or outside the image do nothing.
 *
 * Only interior pixels (1 <= x < imgWidth - 1, 1 <= y < imgHeigeht - 1) are
 * written; border pixels of `out` are left untouched, so the caller must
 * initialise `out` if it needs defined border values.
 *
 * @param in         device pointer to the input image (imgHeigeht * imgWidth bytes)
 * @param out        device pointer to the output image (same size as `in`)
 * @param imgHeigeht number of image rows
 * @param imgWidth   number of image columns (also the row stride in elements)
 */
__global__ void Sobel_gpu(unsigned char* in, unsigned char* out, int imgHeigeht, int imgWidth)
{
    int IdxX = threadIdx.x + blockDim.x * blockIdx.x;
    int IdxY = threadIdx.y + blockDim.y * blockIdx.y;

    // A pixel needs a full 3x3 neighbourhood, so both coordinates must be at
    // least one pixel away from every image edge. The test is done per
    // coordinate (not on the flat index) so out-of-range threads are rejected.
    if (IdxX > 0 && IdxX < imgWidth - 1 && IdxY > 0 && IdxY < imgHeigeht - 1)
    {
        int Idx = IdxY * imgWidth + IdxX;

        // 3x3 neighbourhood, named row by row:
        //   x0 x1 x2
        //   x3 .. x5
        //   x6 x7 x8
        // The centre pixel has weight 0 in both Sobel masks and is not read.
        unsigned char x0 = in[(IdxY - 1) * imgWidth + IdxX - 1];
        unsigned char x1 = in[(IdxY - 1) * imgWidth + IdxX];
        unsigned char x2 = in[(IdxY - 1) * imgWidth + IdxX + 1];
        unsigned char x3 = in[IdxY * imgWidth + IdxX - 1];
        unsigned char x5 = in[IdxY * imgWidth + IdxX + 1];
        unsigned char x6 = in[(IdxY + 1) * imgWidth + IdxX - 1];
        unsigned char x7 = in[(IdxY + 1) * imgWidth + IdxX];
        unsigned char x8 = in[(IdxY + 1) * imgWidth + IdxX + 1];

        // Horizontal gradient: weighted left column minus weighted right column.
        int Gx = (x0 + x3 * 2 + x6) - (x2 + x5 * 2 + x8);
        // Vertical gradient: weighted top row minus weighted bottom row.
        int Gy = (x0 + x1 * 2 + x2) - (x6 + x7 * 2 + x8);

        // The averaged magnitude can exceed 255 on strong edges; saturate
        // instead of letting the narrowing store wrap around.
        int magnitude = (abs(Gx) + abs(Gy)) / 2;
        out[Idx] = static_cast<unsigned char>(magnitude > 255 ? 255 : magnitude);
    }
}
/**
 * Sobel edge-magnitude kernel; each thread handles one pixel of a row-major
 * image stored with one unsigned char per pixel.
 *
 * Launch layout: a 2D grid of 2D blocks in which the thread's global x index
 * is the column and the global y index is the row. Threads on the image
 * border or outside the image return immediately.
 *
 * Only interior pixels (1 <= x < width - 1, 1 <= y < height - 1) are written;
 * border pixels of `output` are left untouched, so the caller must initialise
 * `output` if it needs defined border values.
 *
 * @param input  device pointer to the input image (width * height bytes)
 * @param output device pointer to the output image (same size as `input`)
 * @param width  number of image columns (also the row stride in elements)
 * @param height number of image rows
 */
__global__ void sobelFilter(const unsigned char* input, unsigned char* output, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The 3x3 stencil needs one pixel of margin on every side.
    if (x < 1 || x >= width - 1 || y < 1 || y >= height - 1) {
        return;
    }

    // Row pointers for the three rows covered by the stencil.
    const unsigned char* above = input + (y - 1) * width;
    const unsigned char* row   = input + y * width;
    const unsigned char* below = input + (y + 1) * width;

    // Horizontal gradient: weighted right column minus weighted left column.
    int Gx = (above[x + 1] + 2 * row[x + 1] + below[x + 1])
           - (above[x - 1] + 2 * row[x - 1] + below[x - 1]);
    // Vertical gradient: weighted bottom row minus weighted top row.
    int Gy = (below[x - 1] + 2 * below[x] + below[x + 1])
           - (above[x - 1] + 2 * above[x] + above[x + 1]);

    // The averaged magnitude can exceed 255 on strong edges; saturate instead
    // of letting the narrowing store wrap around.
    int magnitude = (abs(Gx) + abs(Gy)) / 2;
    output[y * width + x] = static_cast<unsigned char>(magnitude > 255 ? 255 : magnitude);
}
/**
 * Loads ./lena.jpg as a grayscale image, runs the sobelFilter kernel on it
 * and shows the result in a window.
 *
 * Returns 0 on success and 1 if the image cannot be loaded or any CUDA call
 * fails (the failure reason is printed to stderr).
 */
int main()
{
    // Flag 0 requests a single-channel grayscale image (cv::IMREAD_GRAYSCALE).
    cv::Mat img = cv::imread("./lena.jpg", 0);
    if (img.empty()) {
        std::cerr << "Failed to load image ./lena.jpg" << std::endl;
        return 1;
    }

    int imgHeight = img.rows;
    int imgWidth = img.cols;
    cv::Mat dst_gpu = cv::Mat(imgHeight, imgWidth, CV_8UC1, cv::Scalar(0));

    // Widen before multiplying so large images cannot overflow int.
    size_t N = static_cast<size_t>(imgHeight) * static_cast<size_t>(imgWidth) * sizeof(unsigned char);

    unsigned char* in_gpu = nullptr;
    unsigned char* out_gpu = nullptr;

    // One thread per pixel; the grid is rounded up so it covers the whole image.
    dim3 threadsPerBlock(32, 32, 1);
    dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Each step runs only if every previous step succeeded; the first error
    // is kept in `err` and reported after the device buffers are released.
    cudaError_t err = cudaMalloc((void**)&in_gpu, N);
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&out_gpu, N);
    }
    if (err == cudaSuccess) {
        // The kernel does not write border pixels; zero the buffer so they
        // are well defined when copied back.
        err = cudaMemset(out_gpu, 0, N);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(in_gpu, img.data, N, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        // Execution configuration is <<<grid, block>>>; the kernel takes
        // (input, output, width, height).
        sobelFilter <<<blocksPerGrid, threadsPerBlock>>>(in_gpu, out_gpu, imgWidth, imgHeight);
        err = cudaGetLastError();  // reports an invalid launch configuration
    }
    if (err == cudaSuccess) {
        // Blocking copy on the default stream: it waits for the kernel and
        // also surfaces any error raised while the kernel was executing.
        err = cudaMemcpy(dst_gpu.data, out_gpu, N, cudaMemcpyDeviceToHost);
    }

    // cudaFree on a null pointer is a no-op, so this is safe on every path.
    cudaFree(in_gpu);
    cudaFree(out_gpu);

    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    cv::imshow("dst_gpu", dst_gpu);
    cv::waitKey(0);
    return 0;
}