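// Weighted blending of two images: performance comparison between running on
// the GPU with texture memory, on the GPU without texture memory, and on the CPU.
// References:
// https://blog.csdn.net/langzai310/article/details/83573278
// https://blog.csdn.net/shuzfan/article/details/77095270
// https://learnopengl-cn.readthedocs.io/zh/latest/01%20Getting%20started/06%20Textures/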
#include <opencv2\opencv.hpp>
#include <iostream>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
using namespace std;
using namespace cv;
// Prints a selection of fields from a cudaDeviceProp structure to stdout,
// one field per line.
//
// prop: device properties, as filled in by cudaGetDeviceProperties().
//
// Several fields (totalGlobalMem, sharedMemPerBlock, memPitch, totalConstMem,
// textureAlignment) are size_t. They are cast to unsigned long long and printed
// with %llu: passing a size_t to %d is undefined behaviour and prints garbage
// on 64-bit builds.
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %llu.\n", (unsigned long long)prop.totalGlobalMem);
    printf("sharedMemPerBlock : %llu.\n", (unsigned long long)prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %llu.\n", (unsigned long long)prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %llu.\n", (unsigned long long)prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate);
    printf("textureAlignment : %llu.\n", (unsigned long long)prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}
// Initializes CUDA: selects the first device whose properties can be queried
// and whose compute-capability major version is at least 1, and makes it the
// current device.
//
// The properties of every device that is successfully queried (up to and
// including the selected one) are printed with printDeviceProp().
//
// Returns true when a device was selected, false when the device count could
// not be obtained, no suitable device exists, or cudaSetDevice() failed.
// A diagnostic is written to stderr in every failure case.
bool InitCUDA()
{
    // Initialise count so a failed query cannot leave it indeterminate.
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    int i;
    for (i = 0; i < count; i++)
    {
        cudaDeviceProp prop;
        // Query once; skip devices whose properties cannot be read so that
        // an unfilled structure is never printed or inspected.
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
            continue;
        }
        printDeviceProp(prop);
        if (prop.major >= 1) { break; }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    const cudaError_t err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s.\n", i, cudaGetErrorString(err));
        return false;
    }
    return true;
}
// File-scope CUDA texture references; main() binds one to each input image
// and weightAddKerkel samples both with tex2D().
//texture <uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex1;
//texture <uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex2;
texture <uchar4, cudaTextureType2D, cudaReadModeElementType> refTex1;
texture <uchar4, cudaTextureType2D, cudaReadModeElementType> refTex2;// element-type read mode: texel data is returned without conversion
// CUDA arrays that main() allocates and binds to refTex1 / refTex2
cudaArray* cuArray1;
cudaArray* cuArray2;
// Channel format descriptor for uchar4 texels (used when allocating the CUDA arrays)
cudaChannelFormatDesc cuDesc = cudaCreateChannelDesc<uchar4>();
// Prints a prompt and blocks until a character is read from stdin.
// NOTE(review): expands to two statements, so it is unsafe as the body of an
// unbraced if/else.
#define PAUSE printf("Press Enter key to continue..."); fgetc(stdin);
// Kernel 1 (texture path): blends the two bound textures 50/50.
//
// Each thread handles one pixel (x from the grid/block x index, y from the
// y index) and fetches the texel at that coordinate from refTex1 and refTex2.
//
// pDstImgData: device output buffer, interleaved, `channels` bytes per pixel.
// imgHeight, imgWidth: image size in pixels; threads outside it do nothing.
// channels: byte stride between consecutive pixels in pDstImgData. Four bytes
//           are written per pixel (three blended values and a 0), so the
//           buffer layout must provide at least 4 bytes per pixel.
__global__ void weightAddKerkel(uchar *pDstImgData, int imgHeight, int imgWidth, int channels)
{
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int row = blockDim.y*blockIdx.y + threadIdx.y;

    // Threads that fall outside the image have nothing to do.
    if (col >= imgWidth || row >= imgHeight) {
        return;
    }

    // Texture fetch of the co-located texel from each texture.
    const uchar4 texelA = tex2D(refTex1, col, row);
    const uchar4 texelB = tex2D(refTex2, col, row);

    const float alpha = 0.5f;
    uchar *dst = pDstImgData + (row*imgWidth + col)*channels;

    dst[0] = (alpha*texelA.x + (1 - alpha)*texelB.x);
    dst[1] = (alpha*texelA.y + (1 - alpha)*texelB.y);
    dst[2] = (alpha*texelA.z + (1 - alpha)*texelB.z);
    // Fourth component is always written as 0.
    dst[3] = 0;
}
// Kernel 2 (plain global-memory path): blends two interleaved images 50/50
// without using textures.
//
// Each thread handles one pixel (x from the grid/block x index, y from the
// y index):
//   - pixels in the first row or first column blend the two source pixels
//     directly;
//   - every other pixel first averages, per image, the pixel itself with its
//     left, upper and upper-left neighbours, then blends the two averages.
//
// simply_d: device output buffer.
// lena_d, moon_d: device input buffers.
// imgHeight, imgWidth: image size in pixels; threads outside it do nothing.
// channels: byte stride between consecutive pixels in all three buffers.
//           Three components are blended and a fourth is written as 0, so the
//           layout must provide at least 4 bytes per pixel.
__global__ void weightAddsimply(uchar *simply_d, uchar *lena_d, uchar *moon_d, int imgHeight, int imgWidth, int channels)
{
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    if (col >= imgWidth || row >= imgHeight) {
        return;
    }

    const float alpha = 0.5f;
    const int cur = (row*imgWidth + col)*channels;

    if (col == 0 || row == 0) {
        // Border pixel: no left/upper neighbours, blend the pixel itself.
        for (int c = 0; c < 3; ++c) {
            simply_d[cur + c] = (lena_d[cur + c])* alpha + (moon_d[cur + c]) * (1 - alpha);
        }
    }
    else {
        // Byte offsets of the three neighbours used for the average.
        const int up = ((row - 1)*imgWidth + col)*channels;
        const int upLeft = ((row - 1)*imgWidth + col - 1)*channels;
        const int left = (row*imgWidth + col - 1)*channels;

        for (int c = 0; c < 3; ++c) {
            const int lenaSum = lena_d[up + c] + lena_d[upLeft + c] + lena_d[left + c] + lena_d[cur + c];
            const int moonSum = moon_d[up + c] + moon_d[upLeft + c] + moon_d[left + c] + moon_d[cur + c];
            simply_d[cur + c] = lenaSum * alpha / 4 + moonSum * (1 - alpha) / 4;
        }
    }

    // Fourth component is always written as 0.
    simply_d[cur + 3] = 0;
}
/*
__global__ void weightAddsimply(uchar4 *simply_d, uchar4 *lena_d, uchar4 *moon_d, int imgHeight, int imgWidth, int channels)
{
const int tidx = blockDim.x*blockIdx.x + threadIdx.x;
const int tidy = blockDim.y*blockIdx.y + threadIdx.y;
float alpha = 0.5;
if (tidx<imgWidth && tidy<imgHeight)
{
if (tidx == 0 || tidy == 0) {
//int idx1 = (tidy + 1)*imgWidth + tidx;
//int idx2 = (tidy + 1)*imgWidth + tidx + 1;
//int idx3 = (tidy)*imgWidth + tidx + 1;
int idx = tidy*imgWidth + tidx;
simply_d[idx].x = (lena_d[idx].x)* alpha + (moon_d[idx].x) * (1 - alpha);
simply_d[idx].y = (lena_d[idx].y)* alpha + (moon_d[idx].y) * (1 - alpha);
simply_d[idx].z = (lena_d[idx].z)* alpha + (moon_d[idx].z) * (1 - alpha);
simply_d[idx].w = 0;
}
else {
int idx1 = (tidy - 1)*imgWidth + tidx;
int idx2 = (tidy - 1)*imgWidth + tidx - 1;
int idx3 = (tidy)*imgWidth + tidx - 1;
int idx = tidy*imgWidth + tidx;
simply_d[idx].x = (lena_d[idx1].x+ lena_d[idx2].x+lena_d[idx3].x+ lena_d[idx].x)* alpha/4
+ (moon_d[idx1].x+ moon_d[idx2].x+ moon_d[idx3].x+ moon_d[idx].x)*(1 - alpha)/4;
simply_d[idx].y = (lena_d[idx1].y + lena_d[idx2].y + lena_d[idx3].y + lena_d[idx].y)* alpha / 4
+ (moon_d[idx1].y + moon_d[idx2].y + moon_d[idx3].y + moon_d[idx].y)*(1 - alpha) / 4;
simply_d[idx].z = (lena_d[idx1].z + lena_d[idx2].z + lena_d[idx3].z + lena_d[idx].z)* alpha / 4
+ (moon_d[idx1].z + moon_d[idx2].z + moon_d[idx3].z + moon_d[idx].z)*(1 - alpha) / 4;
simply_d[idx].w = 0;
}
}
}
*/
// CPU reference: blends two interleaved 8-bit images 50/50, element by element.
//
// d_in1, d_in2: input buffers of imgHeight * imgWidth * channels bytes.
// d_out: output buffer of the same size; every component (all `channels` of
//        them) receives 0.5 * d_in1 + 0.5 * d_in2, converted back to
//        unsigned char.
// Nothing is written when any of the three dimensions is not positive.
void cpu_dejia(unsigned char * d_in1, unsigned char * d_in2, unsigned char * d_out, int imgHeight, int imgWidth, int channels) {
    if (imgHeight <= 0 || imgWidth <= 0 || channels <= 0) {
        return;
    }

    const float alpha = 0.5f;
    // The buffers are densely packed, so the row/column/channel nest
    // collapses into a single pass over every byte.
    const int total = imgHeight * imgWidth * channels;
    for (int p = 0; p < total; ++p) {
        d_out[p] = alpha * d_in1[p] + d_in2[p] * (1 - alpha);
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Leaves main() with exit code 1 when a CUDA call fails. Device allocations
// are not released on that path; the process is terminating anyway.
#define CUDA_TRY(call) do { if (!cudaOk((call), #call)) return 1; } while (0)

// Entry point: blends two images three ways (GPU with textures, GPU with plain
// global memory, CPU), prints the time taken by each and shows the results.
//
// argv[1], argv[2] (optional): paths of the two input images. When omitted,
// the original hard-coded paths are used. If the images differ in size the
// second one is resized to match the first, because every later step indexes
// both images with the dimensions of the first.
//
// Returns 0 on success, 1 when CUDA initialisation, image loading or any
// CUDA runtime call fails.
int main(int argc, char **argv)
{
    // Query the devices and select one.
    if (!InitCUDA()) return 1;

    const char *pathA = (argc > 1) ? argv[1] : "D:\\laopo\\beauty\\test.jpg";
    const char *pathB = (argc > 2) ? argv[2] : "D:\\laopo\\beauty\\test2.jpg";
    Mat Lena = imread(pathA);
    Mat moon = imread(pathB);
    if (Lena.empty() || moon.empty()) {
        fprintf(stderr, "Failed to load input images: %s, %s\n", pathA, pathB);
        return 1;
    }
    if (moon.size() != Lena.size()) {
        Mat resized;
        cv::resize(moon, resized, Lena.size());
        moon = resized;
    }

    // Expand to 4 channels (BGRA) so that one pixel maps onto one uchar4 texel.
    cvtColor(Lena, Lena, CV_BGR2BGRA);
    cvtColor(moon, moon, CV_BGR2BGRA);

    const int imgWidth = Lena.cols;
    const int imgHeight = Lena.rows;
    const int channels = Lena.channels();
    const size_t rowBytes = (size_t)imgWidth * channels * sizeof(uchar);
    const size_t imgBytes = rowBytes * imgHeight;
    clock_t start, end;

    // Texture attributes: clamp addressing, unnormalised coordinates, point
    // sampling. They are set before the reference is bound.
    refTex1.addressMode[0] = cudaAddressModeClamp;
    refTex1.addressMode[1] = cudaAddressModeClamp;
    refTex1.normalized = false;
    refTex1.filterMode = cudaFilterModePoint;
    CUDA_TRY(cudaMallocArray(&cuArray1, &cuDesc, imgWidth, imgHeight));
    CUDA_TRY(cudaBindTextureToArray(refTex1, cuArray1));

    refTex2.addressMode[0] = cudaAddressModeClamp;
    refTex2.addressMode[1] = cudaAddressModeClamp;
    refTex2.normalized = false;
    // The original assigned refTex1.filterMode here (copy-paste slip).
    refTex2.filterMode = cudaFilterModePoint;
    CUDA_TRY(cudaMallocArray(&cuArray2, &cuDesc, imgWidth, imgHeight));
    CUDA_TRY(cudaBindTextureToArray(refTex2, cuArray2));

    // Upload both images into the CUDA arrays, honouring the Mat row pitch.
    CUDA_TRY(cudaMemcpy2DToArray(cuArray1, 0, 0, Lena.data, (size_t)Lena.step, rowBytes, imgHeight, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy2DToArray(cuArray2, 0, 0, moon.data, (size_t)moon.step, rowBytes, imgHeight, cudaMemcpyHostToDevice));

    // Output of the texture kernel.
    Mat dstImg = Mat::zeros(imgHeight, imgWidth, CV_8UC4);
    uchar *pDstImgData = NULL;
    CUDA_TRY(cudaMalloc(&pDstImgData, imgBytes));

    // One thread per pixel; the grid is rounded up to cover the whole image.
    dim3 block(32, 32);
    dim3 grid((imgWidth + block.x - 1) / block.x, (imgHeight + block.y - 1) / block.y);

    // 1: GPU, texture memory
    start = clock();
    weightAddKerkel<<<grid, block>>>(pDstImgData, imgHeight, imgWidth, channels);
    CUDA_TRY(cudaGetLastError());       // launch-configuration errors
    CUDA_TRY(cudaDeviceSynchronize());  // wait for completion, surface execution errors
    end = clock();
    double time1 = (double)(end - start) / CLOCKS_PER_SEC;
    printf("GPU use texture exec time is %.8f s\n", time1);

    CUDA_TRY(cudaMemcpy(dstImg.data, pDstImgData, imgBytes, cudaMemcpyDeviceToHost));
    namedWindow("show1");
    imshow("show1", dstImg);

    // The textures are not needed after kernel 1; unbind them once.
    CUDA_TRY(cudaUnbindTexture(refTex1));
    CUDA_TRY(cudaUnbindTexture(refTex2));

    // 2: GPU, plain global memory
    Mat dstImg2 = Mat::zeros(imgHeight, imgWidth, CV_8UC4);
    uchar *lena_d = NULL, *moon_d = NULL;
    uchar *simply_d = NULL;
    CUDA_TRY(cudaMalloc(&simply_d, imgBytes));
    CUDA_TRY(cudaMalloc(&lena_d, imgBytes));
    CUDA_TRY(cudaMalloc(&moon_d, imgBytes));
    CUDA_TRY(cudaMemcpy(lena_d, Lena.data, imgBytes, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy(moon_d, moon.data, imgBytes, cudaMemcpyHostToDevice));

    start = clock();
    weightAddsimply<<<grid, block>>>(simply_d, lena_d, moon_d, imgHeight, imgWidth, channels);
    CUDA_TRY(cudaGetLastError());
    CUDA_TRY(cudaDeviceSynchronize());
    end = clock();
    CUDA_TRY(cudaMemcpy(dstImg2.data, simply_d, imgBytes, cudaMemcpyDeviceToHost));
    double time2 = (double)(end - start) / CLOCKS_PER_SEC;
    printf("GPU didn't use texture exec time is %.8f s\n", time2);
    namedWindow("show2");
    imshow("show2", dstImg2);

    // 3: CPU reference
    Mat hechengImage = Mat::zeros(imgHeight, imgWidth, CV_8UC4);
    start = clock();
    cpu_dejia(Lena.data, moon.data, hechengImage.data, imgHeight, imgWidth, channels);
    end = clock();
    double time3 = (double)(end - start) / CLOCKS_PER_SEC;
    printf("CPU exec time is %.8f s\n", time3);
    namedWindow("hecheng");
    imshow("hecheng", hechengImage);

    double timefast = time3 - time1;
    printf("GPU纹理内存下的处理速度比CPU下快 %.8f s\n", timefast);

    // Release every device resource, including pDstImgData which the original
    // leaked. Failures are reported but do not prevent the windows from showing.
    cudaOk(cudaFreeArray(cuArray1), "cudaFreeArray(cuArray1)");
    cudaOk(cudaFreeArray(cuArray2), "cudaFreeArray(cuArray2)");
    cudaOk(cudaFree(pDstImgData), "cudaFree(pDstImgData)");
    cudaOk(cudaFree(simply_d), "cudaFree(simply_d)");
    cudaOk(cudaFree(lena_d), "cudaFree(lena_d)");
    cudaOk(cudaFree(moon_d), "cudaFree(moon_d)");

    // waitKey(0) blocks until a key is pressed so the windows stay visible.
    waitKey(0);
    return 0;
}
#undef CUDA_TRY