#include "device_functions.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdlib.h"
#include <string>
#include <cassert>
#include <iostream>
using namespace std;
#include <opencv2\opencv.hpp>
#include <opencv2/core.hpp>
#include <opencv2/highgui/highgui.hpp>
using namespace cv;
#include<stdlib.h>
/**
 * Checks the status code returned by a CUDA runtime call.
 *
 * On cudaSuccess this is a no-op. On any other status it prints the
 * original failure message followed by the runtime's description of the
 * error to stderr and terminates the process, so that callers never
 * continue with an invalid device pointer or a failed copy.
 *
 * @param err  Status returned by a CUDA runtime API call.
 */
void GetCudaCalError(cudaError err)
{
	if (err == cudaSuccess)
	{
		return;
	}
	// The fixed message only mentions allocation; append the runtime's own
	// description so failures of other calls are identifiable as well.
	std::cerr << "分配内存失败!程序结束!"
		<< " (" << cudaGetErrorString(err) << ")" << std::endl;
	// The message announces that the program ends, so actually end it.
	exit(EXIT_FAILURE);
}
/**
 * Prints the number of CUDA devices plus the block-size and grid-size
 * limits of device 0, and returns the maximum number of threads per block
 * of device 0.
 *
 * Terminates the process with an error message when the device count or
 * the device properties cannot be queried, or when no device is present;
 * otherwise the returned value would come from an uninitialized struct.
 *
 * @return prop.maxThreadsPerBlock of device 0.
 */
int getThreadNumZC()
{
	int count = 0; // number of CUDA devices reported by the runtime
	cudaError_t status = cudaGetDeviceCount(&count);
	if (status != cudaSuccess || count < 1)
	{
		std::cerr << "cudaGetDeviceCount failed or no CUDA device available: "
			<< cudaGetErrorString(status) << std::endl;
		exit(EXIT_FAILURE);
	}
	std::cout << "gpu 的个数:" << count << '\n';

	cudaDeviceProp prop;
	status = cudaGetDeviceProperties(&prop, 0); // second argument selects the device
	if (status != cudaSuccess)
	{
		std::cerr << "cudaGetDeviceProperties failed: "
			<< cudaGetErrorString(status) << std::endl;
		exit(EXIT_FAILURE);
	}
	std::cout << "最大线程数:" << prop.maxThreadsPerBlock << std::endl;
	std::cout << "最大网格类型:" << prop.maxGridSize[0] << '\t' << prop.maxGridSize[1] << '\t' << prop.maxGridSize[2] << std::endl;
	return prop.maxThreadsPerBlock;
}
/**
 * Applies a kernelSize x kernelSize filter to a 4-channel 8-bit image.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per pixel; the launch
 * must provide at least imageRow * imageCol threads. Threads beyond that
 * count do nothing. No shared memory is used.
 *
 * For each pixel the x, y and z channels are filtered independently.
 * Taps that fall outside the image contribute zero. The weighted sum is
 * accumulated in float, clamped to [0, 255] and stored once, so the
 * result no longer depends on the previous contents of d_result and
 * negative partial sums are never converted to unsigned char.
 * The w channel is copied from the source pixel.
 *
 * @param d_image     Device pointer to the input image, row-major, imageRow * imageCol pixels.
 * @param d_kernel    Device pointer to kernelSize * kernelSize filter weights, row-major.
 * @param d_result    Device pointer to the output image, same layout as d_image.
 * @param imageRow    Number of image rows.
 * @param imageCol    Number of image columns.
 * @param kernelSize  Side length of the square filter.
 */
__global__ void convZC(uchar4*d_image, float*d_kernel, uchar4*d_result,int imageRow, int imageCol, int kernelSize)
{
	// 1D launch: flat thread index equals the flat pixel index.
	const int id = blockIdx.x*blockDim.x + threadIdx.x;
	if (id >= imageRow*imageCol)
	{
		return; // surplus thread in the last block
	}

	const int row = id / imageCol;
	const int col = id % imageCol;
	const int half = kernelSize / 2;

	// Per-channel accumulators kept in registers instead of in the
	// unsigned char output, which would truncate every partial sum.
	float sumX = 0.0f;
	float sumY = 0.0f;
	float sumZ = 0.0f;

	for (int i = 0; i < kernelSize; ++i)
	{
		const int curRow = row - half + i;
		if (curRow < 0 || curRow >= imageRow)
		{
			continue; // whole filter row lies outside the image: contributes zero
		}
		for (int j = 0; j < kernelSize; ++j)
		{
			const int curCol = col - half + j;
			if (curCol < 0 || curCol >= imageCol)
			{
				continue; // tap outside the image: contributes zero
			}
			const uchar4 pixel = d_image[curRow*imageCol + curCol];
			const float weight = d_kernel[i*kernelSize + j];
			sumX += weight * pixel.x;
			sumY += weight * pixel.y;
			sumZ += weight * pixel.z;
		}
	}

	// Clamp before narrowing: a float outside [0, 255] cannot be
	// represented as unsigned char.
	uchar4 out;
	out.x = static_cast<unsigned char>(fminf(fmaxf(sumX, 0.0f), 255.0f));
	out.y = static_cast<unsigned char>(fminf(fmaxf(sumY, 0.0f), 255.0f));
	out.z = static_cast<unsigned char>(fminf(fmaxf(sumZ, 0.0f), 255.0f));
	out.w = d_image[id].w;
	d_result[id] = out;
}
/**
 * Blurs a 4-channel 8-bit image with a weighted kernelSize x kernelSize
 * window and divides the weighted sum by kernelSize * kernelSize.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per pixel; the launch
 * must provide at least imageRow * imageCol threads. Threads beyond that
 * count do nothing. No shared memory is used.
 *
 * Taps that fall outside the image contribute zero. The sum and the
 * division are carried out in float, the result is clamped to [0, 255]
 * and stored once, so the output no longer depends on the previous
 * contents of d_result_image (which may be uninitialized).
 * The w channel is copied from the source pixel.
 *
 * @param d_result        Device pointer to the INPUT image (despite its name), row-major.
 * @param d_kernel        Device pointer to kernelSize * kernelSize weights, row-major.
 * @param d_result_image  Device pointer to the output image, same layout as the input.
 * @param imageRow        Number of image rows.
 * @param imageCol        Number of image columns.
 * @param kernelSize      Side length of the square window.
 */
__global__ void avgImage(uchar4*d_result, float*d_kernel, uchar4*d_result_image,int imageRow, int imageCol, int kernelSize)
{
	// 1D launch: flat thread index equals the flat pixel index.
	const int id = blockIdx.x*blockDim.x + threadIdx.x;
	if (id >= imageRow*imageCol)
	{
		return; // surplus thread in the last block
	}

	const int row = id / imageCol;
	const int col = id % imageCol;
	const int half = kernelSize / 2;

	// Per-channel accumulators kept in registers instead of in the
	// unsigned char output, which would truncate every partial sum.
	float sumX = 0.0f;
	float sumY = 0.0f;
	float sumZ = 0.0f;

	for (int i = 0; i < kernelSize; ++i)
	{
		const int curRow = row - half + i;
		if (curRow < 0 || curRow >= imageRow)
		{
			continue; // whole window row lies outside the image: contributes zero
		}
		for (int j = 0; j < kernelSize; ++j)
		{
			const int curCol = col - half + j;
			if (curCol < 0 || curCol >= imageCol)
			{
				continue; // tap outside the image: contributes zero
			}
			const uchar4 pixel = d_result[curRow*imageCol + curCol];
			const float weight = d_kernel[i*kernelSize + j];
			sumX += weight * pixel.x;
			sumY += weight * pixel.y;
			sumZ += weight * pixel.z;
		}
	}

	// Normalise by the window area; skipped for a non-positive size to
	// avoid dividing by zero (the sums are zero in that case anyway).
	const int taps = kernelSize * kernelSize;
	if (taps > 0)
	{
		const float inv = 1.0f / static_cast<float>(taps);
		sumX *= inv;
		sumY *= inv;
		sumZ *= inv;
	}

	// Clamp before narrowing: a float outside [0, 255] cannot be
	// represented as unsigned char.
	uchar4 out;
	out.x = static_cast<unsigned char>(fminf(fmaxf(sumX, 0.0f), 255.0f));
	out.y = static_cast<unsigned char>(fminf(fmaxf(sumY, 0.0f), 255.0f));
	out.z = static_cast<unsigned char>(fminf(fmaxf(sumZ, 0.0f), 255.0f));
	out.w = d_result[id].w;
	d_result_image[id] = out;
}
/**
 * Writes an RGBA pixel buffer to disk as a BGR image and displays it.
 *
 * The buffer is wrapped (not copied) in a cv::Mat, converted from RGBA to
 * BGR, saved under the fixed output directory with the given file name,
 * shown in a window titled "convImage", and the call then blocks in
 * waitKey(0) until a key is pressed.
 *
 * @param filename  File name (with extension) appended to the output directory.
 * @param Image     Host pointer to imageRow * imageClo RGBA pixels; must stay
 *                  valid for the duration of the call.
 * @param imageRow  Number of image rows.
 * @param imageClo  Number of image columns.
 */
void showImageZC(string filename,uchar4 *Image, int imageRow, int imageClo)
{
	// A Mat header over a null or empty buffer would fail inside cvtColor.
	if (!Image || imageRow <= 0 || imageClo <= 0)
	{
		std::cerr << "showImageZC: nothing to show for " << filename << std::endl;
		return;
	}

	// Wrap the raw array in a Mat header without copying the pixels.
	cv::Mat outImage(imageRow, imageClo, CV_8UC4, (void*)Image);
	cv::Mat outImageBGR;
	cv::cvtColor(outImage, outImageBGR, CV_RGBA2BGR);

	std::string file = "E:\\ZC\\procedure\\CUDA\\Images\\";
	file += filename;
	// imwrite reports failure through its return value (e.g. when the
	// directory does not exist); surface it instead of dropping it.
	if (!cv::imwrite(file.c_str(), outImageBGR))
	{
		std::cerr << "showImageZC: failed to write " << file << std::endl;
	}

	// Display the processed image and wait for a key press.
	cv::imshow("convImage", outImageBGR);
	cv::waitKey(0);
}
/**
 * Loads an image, runs the convZC filter and then the avgImage blur on
 * the GPU, and saves/displays both results.
 *
 * Every CUDA runtime call, including the status after each kernel launch,
 * is passed through GetCudaCalError. Both device output buffers are
 * zero-initialised before use and every device and host buffer allocated
 * here is released before returning.
 */
int main()
{
	const string input_file = "E:\\ZC\\procedure\\CUDA\\Images\\1.png";
	const int kernelSize = 3;
	const int kernelCount = kernelSize * kernelSize;
	const size_t kernelBytes = kernelCount * sizeof(float);

	// Read the picture as a 3-channel BGR image.
	cv::Mat imageBGR = cv::imread(input_file.c_str(), CV_LOAD_IMAGE_COLOR);
	if (imageBGR.empty())
	{
		cerr << "读取照片失败:" << input_file << endl;
		exit(1);
	}

	// Convert BGR to 4-channel RGBA so each pixel maps onto one uchar4.
	cv::Mat imageRGB;
	cv::cvtColor(imageBGR, imageRGB, CV_BGR2RGBA);
	// The pixels are copied below as one flat block, so rows must be contiguous.
	if (!imageRGB.isContinuous())
	{
		imageRGB = imageRGB.clone();
	}
	uchar4 *h_image = (uchar4*)imageRGB.ptr<unsigned char>(0);

	const int imageRow = imageRGB.rows;
	const int imageCol = imageRGB.cols;
	const size_t pixelCount = static_cast<size_t>(imageRow) * static_cast<size_t>(imageCol);
	const size_t imageBytes = pixelCount * sizeof(uchar4);

	// Device buffers: input image, filter weights, and the two outputs.
	uchar4 *d_image = NULL;
	uchar4 *d_result = NULL;
	uchar4 *d_avgImage = NULL;
	float *d_kernel = NULL;
	GetCudaCalError(cudaMalloc(&d_image, imageBytes));
	GetCudaCalError(cudaMalloc(&d_kernel, kernelBytes));
	GetCudaCalError(cudaMalloc(&d_result, imageBytes));
	GetCudaCalError(cudaMalloc(&d_avgImage, imageBytes));

	// Both outputs start from zero; the second buffer was previously left
	// uninitialised before being handed to a kernel.
	GetCudaCalError(cudaMemset(d_result, 0, imageBytes));
	GetCudaCalError(cudaMemset(d_avgImage, 0, imageBytes));

	GetCudaCalError(cudaMemcpy(d_image, h_image, imageBytes, cudaMemcpyHostToDevice));

	// Host buffers for the weights and for the two results.
	float *h_kernel = new float[kernelCount];
	uchar4 *h_result = new uchar4[pixelCount];
	uchar4 *h_avgImage = new uchar4[pixelCount];

	// First filter: each row of the kernel is (-1, 0, 1).
	for (int i = 0; i < kernelCount; ++i)
	{
		h_kernel[i] = static_cast<float>(i % kernelSize - 1);
	}
	GetCudaCalError(cudaMemcpy(d_kernel, h_kernel, kernelBytes, cudaMemcpyHostToDevice));

	// One thread per pixel, rounded up to whole blocks.
	const int threadNum = getThreadNumZC();
	if (threadNum <= 0)
	{
		cerr << "invalid thread count per block: " << threadNum << endl;
		exit(1);
	}
	const int blockNum = static_cast<int>((pixelCount + threadNum - 1) / threadNum);

	convZC<<<blockNum, threadNum>>>(d_image, d_kernel, d_result, imageRow, imageCol, kernelSize);
	GetCudaCalError(cudaGetLastError());      // launch-configuration errors
	GetCudaCalError(cudaDeviceSynchronize()); // errors raised while the kernel ran
	GetCudaCalError(cudaMemcpy(h_result, d_result, imageBytes, cudaMemcpyDeviceToHost));

	// Second pass: weights drawn from {0, 1, 2}.
	for (int i = 0; i < kernelCount; ++i)
	{
		h_kernel[i] = static_cast<float>(rand() % 3);
	}
	GetCudaCalError(cudaMemcpy(d_kernel, h_kernel, kernelBytes, cudaMemcpyHostToDevice));

	avgImage<<<blockNum, threadNum>>>(d_result, d_kernel, d_avgImage, imageRow, imageCol, kernelSize);
	GetCudaCalError(cudaGetLastError());
	GetCudaCalError(cudaDeviceSynchronize());
	GetCudaCalError(cudaMemcpy(h_avgImage, d_avgImage, imageBytes, cudaMemcpyDeviceToHost));

	const string name1 = "convTest2.png";
	const string name2 = "avgTest2.png";
	showImageZC(name1, h_result, imageRow, imageCol);
	showImageZC(name2, h_avgImage, imageRow, imageCol);

	// Release every device and host buffer allocated above.
	cudaFree(d_image);
	cudaFree(d_result);
	cudaFree(d_avgImage);
	cudaFree(d_kernel);
	delete[] h_kernel;
	delete[] h_result;
	delete[] h_avgImage;
	return 0;
}
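// Known shortcomings:
// 1. Neither the convolution nor the average pooling supports a configurable stride.
// 2. The average pooling does not reduce the number of pixels (the output has the same size as the input).
// 3. The degree of parallelism of this program can still be increased.
// 4. The code is not easy to port to other environments.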