cuda学习笔记（6） windows opencv安装及使用，常见图像处理操作的GPU实现

猪猪虾的业余生活

于 2024-08-15 19:36:55 发布

阅读量492

点赞数 9

分类专栏： cuda 文章标签：学习笔记 windows

本文链接：https://blog.csdn.net/weixin_43528481/article/details/140360404

版权

cuda 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

1 openCV 简介及安装 + windows10 系统

在这里插入图片描述

从OpenCV官网下载OpenCV安装包，下载的时候尽量不要选择带*的版本，这些版本还在持续更新中；然后直接双击，按照提示进行安装即可，记住安装的位置目录
环境变量-> Path->新建->将opencv安装目录下的bin文件所在路径复制到新建的环境变量里面，也就是对应的第4步，我的安装路径如下：F:\Opencv\opencv\build\x64\vc15\bin -》应用（选vc14还是15和你安装的版本有关系）
配置VS环境-》项目属性》
包含目录：将两个路径添加进去F:\Opencv\opencv\build\include\opencv2 和 F:\Opencv\opencv\build\include
库目录：将路径F:\Opencv\opencv\build\x64\vc15\lib添加到库目录里面

在这里插入图片描述
点黄色的地方就可以进行编辑

项目属性 -> 链接器 -> 输入 -> 附加依赖项 -》将目录F:\Opencv\opencv\build\x64\vc15\lib下的opencv_world410d.lib添加进去

2 openCV使用

在这里插入图片描述

3 图像灰度化和边缘提取 CPU实现和GPU实现


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include<vector>

using namespace std;
using namespace cv;

#define TILE 16                 // thread-block edge length: each block is TILE x TILE threads
const int KERNEL_RADIUS = 1;    // radius of the convolution mask (radius 1 => 3x3 mask)

// Note on edge_kernel's parameters: stride_input / stride_output are the row
// strides (bytes per image row) of the input and output images.

// 3x3 edge-extraction mask kept in GPU constant memory.
// The coefficients are the same as the CPU mask handed to filter2D in main()
// (centre +8, neighbours -1), so the CPU and GPU results can be compared
// directly instead of being sign-inverted versions of each other.
__constant__ int KERNEL[2 * KERNEL_RADIUS + 1][2 * KERNEL_RADIUS + 1] =
{
	{ -1, -1, -1 },
	{ -1,  8, -1 },
	{ -1, -1, -1 }
};

/*
 * Applies the mask stored in KERNEL to a single-channel 8-bit image.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel; the grid may be
 * larger than the image (threads outside the image do nothing).
 *
 * input         device pointer to the source image
 * output        device pointer to the destination image
 * width/height  image size in pixels
 * stride_input  bytes per row of the source image
 * stride_output bytes per row of the destination image
 *
 * Only grey images (stride_input / width == 1) are processed; for any other
 * layout the kernel leaves the output untouched, as the original code did.
 * Pixels closer than KERNEL_RADIUS to the image edge have no complete
 * neighbourhood and are written as 0.
 */
__global__ void edge_kernel(unsigned char* input, unsigned char* output, int width, int height, int stride_input, int stride_output)
{
	// One thread per pixel: (x, y) is the pixel coordinate handled by this thread.
	int x = blockDim.x * blockIdx.x + threadIdx.x;
	int y = blockDim.y * blockIdx.y + threadIdx.y;

	// The grid is rounded up to whole blocks, so some threads lie outside the image.
	if (x >= width || y >= height)
		return;

	int channel = stride_input / width; // 1 => grey image, 3 => BGR image
	if (channel != 1)
		return;

	// Border pixels: the mask would reach outside the image, so emit a defined value.
	if (x < KERNEL_RADIUS || x >= width - KERNEL_RADIUS ||
		y < KERNEL_RADIUS || y >= height - KERNEL_RADIUS)
	{
		output[(size_t)y * stride_output + x] = 0;
		return;
	}

	// Accumulate in int: inputs are 0..255 and the coefficients are ints, so the
	// sum is exact. The window is centred on (x, y): offsets run -R..+R.
	int acc = 0;
	for (int dy = -KERNEL_RADIUS; dy <= KERNEL_RADIUS; dy++) {
		// Rows are addressed with the byte stride, not the pixel width.
		const unsigned char* row = input + (size_t)(y + dy) * stride_input;
		for (int dx = -KERNEL_RADIUS; dx <= KERNEL_RADIUS; dx++) {
			acc += row[x + dx] * KERNEL[dy + KERNEL_RADIUS][dx + KERNEL_RADIUS];
		}
	}

	// Saturate to the 8-bit range; converting an out-of-range value straight to
	// unsigned char would wrap around (or be undefined for a float source).
	acc = acc < 0 ? 0 : (acc > 255 ? 255 : acc);
	output[(size_t)y * stride_output + x] = (unsigned char)acc;
}

/*
 * Edge-extraction demo.
 * Converts the loaded image to grey, applies the 3x3 mask on the CPU with
 * filter2D and on the GPU with edge_kernel, prints both timings (in clock()
 * ticks) and shows the grey image and both results until ESC is pressed.
 * Returns 0 on success, 1 if the image cannot be loaded or a CUDA call fails.
 */
int main()
{
	Mat frame = imread("I:\\ONE_grade\\SZ_code\\class\\CudaRuntime2\\bg.jpg");
	if (frame.empty())
	{
		// imread returns an empty Mat on failure; cvtColor would throw on it.
		cerr << "failed to load the input image" << endl;
		return 1;
	}
	cout << frame.cols << "  " << frame.rows << "  " << frame.step << ",static_cast<int>(frame.step) = " << static_cast<int>(frame.step) << endl;

	Mat grey, edge_gpu(frame.rows, frame.cols, CV_8UC1); // 8-bit, single channel (grey)
	Mat Edge_pic;

	//**********************************  CPU: edge extraction with a mask  ****************************
	// Grey conversion
	cvtColor(frame, grey, COLOR_BGR2GRAY);

	clock_t time_start, time_end;
	time_start = clock();
	// Edge extraction
	Mat mask = (Mat_<char>(3, 3) << -1, -1, -1, -1, 8, -1, -1, -1, -1);
	filter2D(grey, Edge_pic, grey.depth(), mask);
	time_end = clock();
	cout << " the cpu time = " << time_end - time_start << endl;

	//**************************** GPU: edge extraction with a mask  ****************************
	// Buffer sizes in bytes (row stride * rows), computed in size_t to avoid int overflow.
	const size_t input_bytes = static_cast<size_t>(grey.step) * grey.rows;
	const size_t output_bytes = static_cast<size_t>(edge_gpu.step) * edge_gpu.rows;

	// Allocate device memory. Every CUDA call is chained on `status`, so the
	// first failure is kept and all later steps are skipped.
	unsigned char* input = NULL, * output = NULL;
	cudaError_t status = cudaMalloc((void**)&input, input_bytes);
	if (status == cudaSuccess)
		status = cudaMalloc((void**)&output, output_bytes);
	// Zero the output so pixels the kernel does not write have a defined value.
	if (status == cudaSuccess)
		status = cudaMemset(output, 0, output_bytes);

	// Copy the grey image from host to device
	if (status == cudaSuccess)
		status = cudaMemcpy(input, grey.data, input_bytes, cudaMemcpyHostToDevice);

	time_start = clock();
	if (status == cudaSuccess)
	{
		dim3 blockdim(TILE, TILE);
		// Ceil-division: just enough blocks to give every pixel one thread.
		dim3 griddim((edge_gpu.cols + TILE - 1) / TILE, (edge_gpu.rows + TILE - 1) / TILE);
		edge_kernel << <griddim, blockdim >> > (input, output, grey.cols, grey.rows, static_cast<int>(grey.step), static_cast<int>(edge_gpu.step));
		status = cudaGetLastError();   // launch errors are not returned by the launch itself
	}

	// Copy the result back; this blocking copy also waits for the kernel to
	// finish, so no separate cudaDeviceSynchronize is needed.
	if (status == cudaSuccess)
		status = cudaMemcpy(edge_gpu.data, output, output_bytes, cudaMemcpyDeviceToHost);
	time_end = clock();

	// Release device memory on every path (cudaFree(NULL) is a no-op).
	cudaFree(input);
	cudaFree(output);

	if (status != cudaSuccess)
	{
		cerr << "CUDA error: " << cudaGetErrorString(status) << endl;
		return 1;
	}
	cout << " the GPU time = " << time_end - time_start << endl;

	namedWindow("grey", WINDOW_NORMAL);  // named, freely resizable window
	imshow("grey", grey);

	namedWindow("cpu result", WINDOW_NORMAL);
	imshow("cpu result", Edge_pic);

	namedWindow("edge_gpu", WINDOW_NORMAL);
	imshow("edge_gpu", edge_gpu);

	// Keep the windows open until ESC (key code 27) is pressed
	while (true)
	{
		int key = waitKey();
		if (key == 27)
			break;
	}
	return 0;
}

在这里插入图片描述

灰度图和BGR图在OpenCV里面的存储方式如下，BGR图是用连续3个值表示一个像素点，3个值依次是B、G、R
灰度图

在这里插入图片描述

RGB
一些特殊情况下每行末尾会有补齐（padding），补的那些位置不记录任何信息，可以是任意值，但不会影响图像

4 图像直方图均衡化和图像增强

在这里插入图片描述

5 BGR图转灰度图

（1）常用的openCV处理图像的命令

在这里插入图片描述

获取灰度图某点的像素值示例

在这里插入图片描述

（2）BGR图转灰度图,对比openCV自带库函数和GPU转换之间的速度

在这里插入图片描述


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include<vector>

using namespace std;
using namespace cv;

#define TILE 16

/*
 * Converts an 8-bit image to a single-channel grey image, one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks covering width x height; threads that
 * fall outside the image do nothing.
 *
 * input         device pointer to the source image (grey, or interleaved B,G,R)
 * output        device pointer to the single-channel destination image
 * width/height  image size in pixels
 * stride_input  bytes per row of the source image
 * stride_output bytes per row of the destination image
 *
 * The channel count is derived as stride_input / width: 1 copies the pixel
 * unchanged, 3 applies grey = 0.114*B + 0.587*G + 0.299*R rounded to nearest.
 * Any other value leaves the output untouched.
 */
__global__ void bgr2grey_kernel(unsigned char *input, unsigned char *output, int width, int height, int stride_input, int stride_output)
{
	// One thread per pixel: (x, y) is the pixel coordinate handled by this thread.
	int x = blockDim.x * blockIdx.x + threadIdx.x;
	int y = blockDim.y * blockIdx.y + threadIdx.y;

	// The grid is rounded up to whole blocks; skip threads outside the image.
	if (x >= width || y >= height)
		return;

	int channel = stride_input / width; // 1 => grey image, 3 => BGR image

	// Rows are addressed with their byte strides, for both source and destination.
	const unsigned char* src_row = input + (size_t)y * stride_input;
	unsigned char* dst_row = output + (size_t)y * stride_output;

	if (channel == 1)
	{
		dst_row[x] = src_row[x];
	}
	else if (channel == 3)
	{
		const unsigned char* px = src_row + x * 3;   // px[0]=B, px[1]=G, px[2]=R
		// float literals keep the arithmetic in single precision (no double promotion)
		float grey = px[0] * 0.114f + px[1] * 0.587f + px[2] * 0.299f;
		// +0.5f rounds to nearest instead of truncating
		dst_row[x] = (unsigned char)(grey + 0.5f);
	}
}

/*
 * BGR -> grey demo.
 * Converts the loaded image with cvtColor on the CPU and with bgr2grey_kernel
 * on the GPU, prints both timings (in clock() ticks) and shows both results
 * until ESC is pressed.
 * Returns 0 on success, 1 if the image cannot be loaded or a CUDA call fails.
 */
int main()
{
	Mat frame = imread("I:\\ONE_grade\\SZ_code\\class\\CudaRuntime2\\bg.jpg");
	if (frame.empty())
	{
		// imread returns an empty Mat on failure; cvtColor would throw on it.
		cerr << "failed to load the input image" << endl;
		return 1;
	}
	cout << frame.cols << "  " << frame.rows << "  " << frame.step << ",static_cast<int>(frame.step) = " << static_cast<int>(frame.step)<<endl;

//--------------------------------------------------- CPU
	clock_t time_start, time_end;
	time_start =  clock();
	Mat grey, grey_gpu(frame.rows, frame.cols, CV_8UC1); // 8-bit, single channel (grey)
	cvtColor(frame, grey, COLOR_BGR2GRAY);   // grey conversion
	time_end = clock();
	cout << " the cpu time = " << time_end - time_start << endl;

//--------------------------------------------------- GPU
	// Buffer sizes in bytes (row stride * rows), computed in size_t to avoid int overflow.
	const size_t input_bytes = static_cast<size_t>(frame.step) * frame.rows;
	const size_t output_bytes = static_cast<size_t>(grey_gpu.step) * grey_gpu.rows;

	// Allocate device memory. Every CUDA call is chained on `status`, so the
	// first failure is kept and all later steps are skipped.
	unsigned char* input = NULL, * output = NULL;
	cudaError_t status = cudaMalloc((void**)&input, input_bytes);
	if (status == cudaSuccess)
		status = cudaMalloc((void**)&output, output_bytes);

	// Copy the source image from host to device
	if (status == cudaSuccess)
		status = cudaMemcpy(input, frame.data, input_bytes, cudaMemcpyHostToDevice);

	time_start = clock();
	if (status == cudaSuccess)
	{
		dim3 blockdim(TILE, TILE);
		// Ceil-division: just enough blocks to give every pixel one thread.
		dim3 griddim((grey_gpu.cols + TILE - 1) / TILE, (grey_gpu.rows + TILE - 1) / TILE);
		bgr2grey_kernel << <griddim, blockdim >> > (input, output, frame.cols, frame.rows, static_cast<int>(frame.step), static_cast<int>(grey_gpu.step));
		status = cudaGetLastError();   // launch errors are not returned by the launch itself
	}

	// Copy the result back; this blocking copy also waits for the kernel to
	// finish, so no separate cudaDeviceSynchronize is needed.
	if (status == cudaSuccess)
		status = cudaMemcpy(grey_gpu.data, output, output_bytes, cudaMemcpyDeviceToHost);
	time_end = clock();

	// Release device memory on every path (cudaFree(NULL) is a no-op).
	cudaFree(input);
	cudaFree(output);

	if (status != cudaSuccess)
	{
		cerr << "CUDA error: " << cudaGetErrorString(status) << endl;
		return 1;
	}
	cout << " the GPU time = " << time_end - time_start << endl;

	namedWindow("grey", WINDOW_NORMAL);  // named, freely resizable window
	imshow("grey", grey);

	namedWindow("grey_gpu", WINDOW_NORMAL);
	imshow("grey_gpu", grey_gpu);

	// Keep the windows open until ESC (key code 27) is pressed
	while (true)
	{
		int key = waitKey();
		if (key == 27)
			break;
	}
	return 0;
}

在这里插入图片描述

6 图片的resize

在这里插入图片描述

7 图像直方图增强

在这里插入图片描述

有bug，待调试。。。。。。。！！！！！！！！！！！！！！！！！


#include "cuda_runtime.h"
#include "device_launch_parameters.h"


#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include<vector>


using namespace std;
using namespace cv;

#define TILE 16



/*
 * Step 1: accumulate a 256-bin histogram for every channel of an 8-bit image.
 *
 * Launch layout: 2D blocks, 3D grid. gridDim.x/y tile the image (one thread
 * per pixel) and blockIdx.z selects the channel this block counts.
 * Dynamic shared memory: at least 256 * sizeof(int) bytes per block.
 *
 * input   device pointer to the interleaved source image
 * hist    device array of channels * 256 counters; channel c occupies
 *         hist[c*256 .. c*256+255] in the image's own channel order.
 *         The kernel only adds to it, so the caller must zero it beforehand.
 * width/height  image size in pixels
 * stride  bytes per image row; the channel count is derived as stride / width
 */
__global__ void hist_cal_kernel(unsigned char* input, int* hist, int width, int height, int stride)
{
	// One thread per pixel: (x, y) is the pixel coordinate handled by this thread.
	int x = blockDim.x * blockIdx.x + threadIdx.x;
	int y = blockDim.y * blockIdx.y + threadIdx.y;
	// Channel index comes from the block's z position in the grid.
	// (blockDim.z is the block's z *size*, which is not a channel index.)
	int z = blockIdx.z;
	int channels = width > 0 ? stride / width : 0;

	// Per-block partial histogram; shared memory is only visible within a block.
	extern __shared__ int hist_S[];

	// Linear thread id within the block and the block's thread count.
	int tid = threadIdx.y * blockDim.x + threadIdx.x;
	int nthreads = blockDim.x * blockDim.y;

	// Clear all 256 bins whatever the block size is (shared memory starts uninitialised).
	for (int bin = tid; bin < 256; bin += nthreads)
		hist_S[bin] = 0;
	__syncthreads();

	if (x < width && y < height && z < channels)
	{
		int value = input[(size_t)y * stride + x * channels + z];
		atomicAdd(&hist_S[value], 1);
	}
	// Outside the branch so that every thread of the block reaches the barrier.
	__syncthreads();

	// Merge this block's partial histogram into the global one for channel z.
	if (z < channels)
	{
		for (int bin = tid; bin < 256; bin += nthreads)
		{
			int count = hist_S[bin];
			if (count != 0)   // skip empty bins to save global atomics
				atomicAdd(&hist[z * 256 + bin], count);
		}
	}
}

/*
 * Step 2: turn each channel's histogram into its equalisation lookup table.
 *
 * Launch layout: one block per channel (blockIdx.x is the channel) with
 * exactly 256 threads per block, one per histogram bin.
 *
 * hist  device array of channels * 256 ints. On entry hist[c*256 + i] is the
 *       number of pixels of channel c with value i; on exit it is
 *       (int)(255 * cdf(i) / total), the output value for input value i.
 *       A channel whose histogram is all zero is left unchanged.
 */
__global__ void hist_sum(int *hist)
{
	// Each block works on its own channel.
	int tid = threadIdx.y * blockDim.x + threadIdx.x;
	int x = blockIdx.x;    // channel index
	__shared__ int hist_S[256];   // statically sized shared memory (no dynamic allocation)

	// Load this channel's histogram into shared memory.
	if (tid < 256)
		hist_S[tid] = hist[256 * x + tid];
	__syncthreads();

	// Tree reduction in shared memory: after the loop hist_S[0] is the pixel total.
	// `s >>= 1` halves the stride each round; both operands come from hist_S.
	for (int s = 128; s > 0; s >>= 1)
	{
		if (tid < s)
		{
			hist_S[tid] += hist_S[tid + s];
		}
		__syncthreads();   // outside the branch so that all threads reach it
	}

	// Cumulative distribution, computed serially by thread 0.
	int total = hist_S[0];
	if (tid == 0 && total > 0)   // total == 0 would divide by zero
	{
		int sum = 0;
		for (int i = 0; i < 256; i++)
		{
			// The count is read before the slot is overwritten with the mapping.
			sum += hist[x * 256 + i];
			hist[x * 256 + i] = (int)((float)255 * sum / total);
		}
	}
}


/*
 * Step 3: histogram equalisation - remap every sample through the lookup table.
 *
 * Launch layout: 2D blocks, 3D grid. gridDim.x/y tile the image (one thread
 * per pixel) and blockIdx.z selects the channel.
 *
 * input   device pointer to the interleaved source image
 * output  device pointer to the destination image, same layout and stride as input
 * hist    device lookup table of channels * 256 ints; hist[c*256 + v] is the
 *         output value for input value v of channel c
 * width/height  image size in pixels
 * stride  bytes per image row; the channel count is derived as stride / width
 */
__global__ void hist_trans_kernel(unsigned char* input, unsigned char* output, int* hist, int width, int height, int stride)
{
	int x = blockDim.x * blockIdx.x + threadIdx.x;
	int y = blockDim.y * blockIdx.y + threadIdx.y;
	// Channel index comes from the block's z position in the grid.
	// (blockDim.z is the block's z *size*, which is not a channel index.)
	int z = blockIdx.z;
	int channels = width > 0 ? stride / width : 0;

	if (x < width && y < height && z < channels)
	{
		size_t idx = (size_t)y * stride + x * channels + z;
		int value = input[idx];
		int F_value = hist[z * 256 + value];
		output[idx] = (unsigned char)F_value;
	}
}

/*
 * Histogram-equalisation demo.
 * Equalises every channel of the loaded image on the CPU (split / equalizeHist
 * / merge) and on the GPU (hist_cal_kernel, hist_sum, hist_trans_kernel),
 * prints both timings (in clock() ticks) and shows the original and both
 * results until ESC is pressed.
 * Returns 0 on success, 1 if the image cannot be loaded or a CUDA call fails.
 */
int main()
{
	Mat frame = imread("I:\\ONE_grade\\SZ_code\\class\\CudaRuntime2\\bg.jpg");
	if (frame.empty())
	{
		// imread returns an empty Mat on failure; split/equalizeHist would fail on it.
		cerr << "failed to load the input image" << endl;
		return 1;
	}
	cout << frame.cols << "  " << frame.rows << "  " << frame.step << ",static_cast<int>(frame.step) = " << static_cast<int>(frame.step)<<endl;


	// ******************************** CPU: histogram equalisation with OpenCV  ********************************
	Mat frame_hist_cpu, frame_hist_gpu(frame.rows, frame.cols, CV_8UC3);

	clock_t time_start, time_end;
	time_start = clock();
	vector<Mat> bgr_channels;
	split(frame, bgr_channels); // separate the image into its channels
	for (int i = 0; i < frame.channels(); i++)
	{
		equalizeHist(bgr_channels[i], bgr_channels[i]);
	}
	merge(bgr_channels, frame_hist_cpu);
	time_end = clock();
	cout << " the CPU time = " << time_end - time_start << endl;

	// ******************************** GPU: histogram equalisation  ********************************
	// Buffer sizes in bytes, computed in size_t to avoid int overflow.
	const size_t input_bytes = static_cast<size_t>(frame.step) * frame.rows;
	const size_t output_bytes = static_cast<size_t>(frame_hist_gpu.step) * frame_hist_gpu.rows;
	const size_t hist_bytes = static_cast<size_t>(frame.channels()) * 256 * sizeof(int);

	// Allocate device memory. Every CUDA call is chained on `status`, so the
	// first failure is kept and all later steps are skipped.
	unsigned char* input = NULL, * output = NULL;
	int* hist = NULL;
	cudaError_t status = cudaMalloc((void**)&input, input_bytes);
	if (status == cudaSuccess)
		status = cudaMalloc((void**)&output, output_bytes);
	if (status == cudaSuccess)
		status = cudaMalloc((void**)&hist, hist_bytes);
	// The histogram is built with atomicAdd, so it has to start from zero;
	// cudaMalloc does not clear the memory it returns.
	if (status == cudaSuccess)
		status = cudaMemset(hist, 0, hist_bytes);

	// Copy the source image from host to device
	if (status == cudaSuccess)
		status = cudaMemcpy(input, frame.data, input_bytes, cudaMemcpyHostToDevice);

	time_start = clock();
	dim3 blockdim(TILE, TILE);
	// Ceil-division in x/y gives every pixel one thread; grid z = one layer per channel.
	dim3 griddim((frame_hist_gpu.cols + TILE - 1) / TILE, (frame_hist_gpu.rows + TILE - 1) / TILE, frame_hist_gpu.channels());
	if (status == cudaSuccess)
	{
		// Third launch argument: bytes of dynamic shared memory (one 256-bin histogram per block).
		hist_cal_kernel << <griddim, blockdim, 256 * sizeof(int) >> > (input, hist, frame.cols, frame.rows, static_cast<int>(frame.step));
		status = cudaGetLastError();   // launch errors are not returned by the launch itself
	}
	if (status == cudaSuccess)
	{
		// One block per channel, one thread per histogram bin.
		hist_sum << <frame.channels(), 256>> > (hist);
		status = cudaGetLastError();
	}
	if (status == cudaSuccess)
	{
		hist_trans_kernel << < griddim, blockdim >> > (input, output, hist, frame.cols, frame.rows, static_cast<int>(frame.step));
		status = cudaGetLastError();
	}

	// Copy the result back; this blocking copy also waits for the kernels to finish.
	if (status == cudaSuccess)
		status = cudaMemcpy(frame_hist_gpu.data, output, output_bytes, cudaMemcpyDeviceToHost);

	time_end = clock();

	// Release device memory on every path (cudaFree(NULL) is a no-op).
	cudaFree(input);
	cudaFree(output);
	cudaFree(hist);

	if (status != cudaSuccess)
	{
		cerr << "CUDA error: " << cudaGetErrorString(status) << endl;
		return 1;
	}
	cout << " the GPU time = " << time_end - time_start << endl;


	namedWindow("原图", WINDOW_NORMAL);  // named, freely resizable window
	imshow("原图", frame);

	namedWindow("CPU 直方图均衡化的图", WINDOW_NORMAL);
	imshow("CPU 直方图均衡化的图", frame_hist_cpu);

	namedWindow("GPU 直方图均衡化的图", WINDOW_NORMAL);
	imshow("GPU 直方图均衡化的图", frame_hist_gpu);


	// Keep the windows open until ESC (key code 27) is pressed
	while (true)
	{
		int key = waitKey();
		if (key == 27)
			break;
	}
	return 0;
}

8 BGR转换成RGB

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include<vector>

using namespace std;
using namespace cv;

#define TILE 16

/*
 * Swaps the first and third channel of a 3-channel interleaved 8-bit image
 * (BGR -> RGB), one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks covering width x height; threads that
 * fall outside the image do nothing.
 *
 * input         device pointer to the source image (3 bytes per pixel: B,G,R)
 * output        device pointer to the destination image (3 bytes per pixel: R,G,B)
 * width/height  image size in pixels
 * stride_input  bytes per row of the source image
 * stride_output bytes per row of the destination image
 */
__global__ void bgrTOrgb_kernel(unsigned char* input, unsigned char* output, int width, int height, int stride_input, int stride_output)
{
	// One thread per pixel: (x, y) is the pixel coordinate handled by this thread.
	int x = blockDim.x * blockIdx.x + threadIdx.x;
	int y = blockDim.y * blockIdx.y + threadIdx.y;

	// The grid is rounded up to whole blocks; skip threads outside the image.
	if (x >= width || y >= height)
		return;

	// Address rows through their byte strides so that padded rows are handled
	// too, instead of assuming every row is exactly width * 3 bytes long.
	const unsigned char* src = input + (size_t)y * stride_input + x * 3;
	unsigned char* dst = output + (size_t)y * stride_output + x * 3;

	// BGR to RGB
	dst[0] = src[2]; // R
	dst[1] = src[1]; // G
	dst[2] = src[0]; // B
}

/*
 * BGR -> RGB demo.
 * Swaps the channels of the loaded image with cvtColor on the CPU and with
 * bgrTOrgb_kernel on the GPU, prints both timings (in clock() ticks) and shows
 * the original and both results until ESC is pressed.
 * Returns 0 on success, 1 if the image cannot be loaded or a CUDA call fails.
 */
int main()
{
	Mat frame = imread("I:\\ONE_grade\\SZ_code\\class\\CudaRuntime2\\bg.jpg");
	if (frame.empty())
	{
		// imread returns an empty Mat on failure; cvtColor would throw on it.
		cerr << "failed to load the input image" << endl;
		return 1;
	}
	cout << frame.cols << "  " << frame.rows << "  " << frame.step << ",static_cast<int>(frame.step) = " << static_cast<int>(frame.step) << endl;

	Mat pic_cpu, pic_gpu(frame.rows, frame.cols, CV_8UC3); // 8-bit, 3 channels

	//**********************************  CPU: BGR -> RGB  ****************************

	clock_t time_start, time_end;
	time_start = clock();
	cvtColor(frame, pic_cpu, COLOR_BGR2RGB);
	time_end = clock();
	cout << " the cpu time = " << time_end - time_start << endl;

	//**********************************  GPU: BGR -> RGB  ****************************
	// Buffer sizes in bytes (row stride * rows), computed in size_t to avoid int overflow.
	const size_t input_bytes = static_cast<size_t>(frame.step) * frame.rows;
	const size_t output_bytes = static_cast<size_t>(pic_gpu.step) * pic_gpu.rows;

	// Allocate device memory. Every CUDA call is chained on `status`, so the
	// first failure is kept and all later steps are skipped.
	unsigned char* input = NULL, * output = NULL;
	cudaError_t status = cudaMalloc((void**)&input, input_bytes);
	if (status == cudaSuccess)
		status = cudaMalloc((void**)&output, output_bytes);

	// Copy the source image from host to device
	if (status == cudaSuccess)
		status = cudaMemcpy(input, frame.data, input_bytes, cudaMemcpyHostToDevice);

	time_start = clock();
	if (status == cudaSuccess)
	{
		dim3 blockdim(TILE, TILE);
		// Ceil-division: just enough blocks to give every pixel one thread.
		dim3 griddim((pic_gpu.cols + TILE - 1) / TILE, (pic_gpu.rows + TILE - 1) / TILE);
		bgrTOrgb_kernel << <griddim, blockdim >> > (input, output, pic_gpu.cols, pic_gpu.rows, static_cast<int>(frame.step), static_cast<int>(pic_gpu.step));
		status = cudaGetLastError();   // launch errors are not returned by the launch itself
	}

	// Copy the result back; this blocking copy also waits for the kernel to
	// finish, so no separate cudaDeviceSynchronize is needed.
	if (status == cudaSuccess)
		status = cudaMemcpy(pic_gpu.data, output, output_bytes, cudaMemcpyDeviceToHost);
	time_end = clock();

	// Release device memory on every path (cudaFree(NULL) is a no-op).
	cudaFree(input);
	cudaFree(output);

	if (status != cudaSuccess)
	{
		cerr << "CUDA error: " << cudaGetErrorString(status) << endl;
		return 1;
	}
	cout << " the GPU time = " << time_end - time_start << endl;

	namedWindow("原图", WINDOW_NORMAL);  // named, freely resizable window
	imshow("原图", frame);

	namedWindow("cpu result", WINDOW_NORMAL);
	imshow("cpu result", pic_cpu);

	namedWindow("gpu result", WINDOW_NORMAL);
	imshow("gpu result", pic_gpu);

	// Keep the windows open until ESC (key code 27) is pressed
	while (true)
	{
		int key = waitKey();
		if (key == 27)
			break;
	}
	return 0;
}

在这里插入图片描述

我学习过程的思考

1. 为什么获取BGR图像的某个位置的像素索引要用 input[y * stride_input + x * channels + z];，而不是 y * stride_input + x，我画的下面这个图有什么问题吗

在这里插入图片描述

猪猪虾的业余生活

关注

9
点赞
踩
14

收藏

觉得还不错? 一键收藏
0
评论
cuda学习笔记（6） windows opencv安装及使用，常见图像处理操作的GPU实现

opencv官网下载opencv包，下载的时候经尽量不要选择带*版本的，这些版本还在持续更新中，然后直接双击按照提示进行安装即可，记住安装的位置目录-》应用（选vc14还是15和你安装的版本有关系）配置VS环境-》项目属性》包含目录：将两个路径添加进去和库目录：在下面路径里面添加到库目录里面点黄色的地方就可以进行编辑C/C++ ->链接期->t附加依赖项目-》将目录下I的添加进入环境。
复制链接

扫一扫

专栏目录