#include <iostream>
#include <string>
#include <cassert>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
// Wraps a CUDA runtime call: forwards its result to check() together with the
// stringified call text, the source file, and the line number of the call site.
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

// Host images. preProcess() fills imageRGBA with the RGBA-converted input and
// creates imageGrey as a single-channel 8-bit image of the same size.
cv::Mat imageRGBA;
cv::Mat imageGrey;

// Device (GPU) buffers. preProcess() records the allocated pointers here so
// that cleanup() can release them.
uchar4 *d_rgbaImage__ = 0;
unsigned char *d_greyImage__ = 0;
// Returns the row count (height) of the global imageRGBA.
size_t numRows() {
    return static_cast<size_t>(imageRGBA.rows);
}
// Returns the column count (width) of the global imageRGBA.
size_t numCols() {
    return static_cast<size_t>(imageRGBA.cols);
}
// Error reporter behind the checkCudaErrors macro.
//
//   err  - status value returned by the wrapped call
//   func - text of the wrapped call (stringified by the macro)
//   file - source file of the call site
//   line - line number of the call site
//
// Returns normally when err equals cudaSuccess. Otherwise prints the location,
// the CUDA error string and the call text to stderr, then exits with code 1.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err == cudaSuccess) {
        return;
    }
    std::cerr << "CUDA error at:" << file << ":" << line << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    exit(1);
}
// Image preprocessing.
//
// Loads `filename`, converts it to RGBA into the global imageRGBA, creates the
// global imageGrey as a single-channel image of the same size, allocates the
// two device buffers, zero-fills the device greyscale buffer and uploads the
// RGBA pixels to the device.
//
// Outputs:
//   inputImage  - host pointer to the first RGBA pixel of imageRGBA
//   greyImage   - host pointer to the first byte of imageGrey
//   d_rgbaImage - device buffer holding a copy of the RGBA pixels
//   d_greyImage - zero-filled device buffer, one byte per pixel
// The two device pointers are also stored in d_rgbaImage__ / d_greyImage__.
//
// Any failure (unreadable file, non-continuous image storage, CUDA error)
// terminates the process with exit code 1.
void preProcess(uchar4 **inputImage, unsigned char **greyImage, uchar4 **d_rgbaImage,
                unsigned char **d_greyImage, const std::string &filename) {
    // NOTE(review): cudaFree(0) is presumably here to force CUDA context
    // initialisation up front, so that setup failures surface immediately.
    checkCudaErrors(cudaFree(0));

    // Read the image from disk as a 3-channel colour image.
    cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
    if (image.empty()) {
        std::cerr << "Couldn't open file:" << filename << std::endl;
        exit(1);
    }

    // Convert the BGR data read by OpenCV to RGBA.
    cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);

    // Create a greyscale image with the same dimensions as the input.
    imageGrey.create(image.rows, image.cols, CV_8UC1);

    // The raw pointers handed out below are used as flat arrays, so both
    // images must be stored contiguously in memory.
    const bool contiguous = imageRGBA.isContinuous() && imageGrey.isContinuous();
    if (!contiguous) {
        std::cerr << "Images aren't continuous!! Exiting." << std::endl;
        exit(1);
    }

    // Expose the host pixel storage of both images to the caller.
    uchar4 *const hostRgba = reinterpret_cast<uchar4 *>(imageRGBA.ptr<unsigned char>(0));
    unsigned char *const hostGrey = imageGrey.ptr<unsigned char>(0);
    *inputImage = hostRgba;
    *greyImage = hostGrey;

    // Allocate GPU memory for the input and the output.
    const size_t numPixels = numRows()*numCols();
    checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4)*numPixels));
    checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char)*numPixels));

    // Clear the greyscale buffer on the GPU.
    checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char)));

    // Upload the RGBA pixels from the host to the GPU.
    checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4)*numPixels, cudaMemcpyHostToDevice));

    // Remember the device pointers so cleanup() can free them later.
    d_rgbaImage__ = *d_rgbaImage;
    d_greyImage__ = *d_greyImage;
}
// Converts an RGBA image to greyscale, one pixel per thread.
//
// Launch layout: the flat pixel index is built from blockIdx.x and the
// thread's 2D position inside its block; blockIdx.y / blockIdx.z are not
// used. Threads whose index is not below numRows*numCols do nothing, so the
// launch may contain more threads than pixels.
//
//   rgbaImage - input pixels, numRows*numCols entries
//   greyImage - output, one byte per pixel
//
// Each output is .299*R + .587*G + .114*B computed in float and truncated on
// conversion to unsigned char; the alpha channel (w) is ignored.
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage, unsigned char* const greyImage, int numRows, int numCols) {
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y;
    const unsigned int offsetInBlock = threadIdx.y * blockDim.x + threadIdx.x;
    const int pixel = blockIdx.x * threadsPerBlock + offsetInBlock;

    // Surplus threads in the final block have no pixel to process.
    if (pixel >= numRows * numCols) {
        return;
    }

    const uchar4 rgba = rgbaImage[pixel];
    greyImage[pixel] = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
}
// Writes the greyscale result to disk.
//
// Wraps `data_ptr` (the pixels are not copied) in a numRows() x numCols()
// single-channel 8-bit cv::Mat and saves it to `output_file`.
//
//   output_file - destination path passed to cv::imwrite
//   data_ptr    - host buffer holding numRows()*numCols() grey bytes
//
// If cv::imwrite reports failure, an error is printed to stderr and the
// process exits with code 1 instead of silently producing no output.
void postProcess(const std::string& output_file, unsigned char* data_ptr) {
    // cv::Mat takes int dimensions; make the narrowing from size_t explicit.
    const int rows = static_cast<int>(numRows());
    const int cols = static_cast<int>(numCols());
    cv::Mat output(rows, cols, CV_8UC1, (void*)data_ptr);

    if (!cv::imwrite(output_file.c_str(), output)) {
        std::cerr << "Couldn't write file:" << output_file << std::endl;
        exit(1);
    }
}
// Releases the device buffers recorded in d_rgbaImage__ / d_greyImage__.
//
// Each cudaFree result goes through checkCudaErrors, so a failure (including
// an earlier asynchronous error reported at this point) is printed rather
// than ignored. The globals are reset to null after freeing so that a second
// call does not free the same allocation twice: cudaFree on a null pointer
// performs no operation.
void cleanup() {
    checkCudaErrors(cudaFree(d_rgbaImage__));
    d_rgbaImage__ = 0;
    checkCudaErrors(cudaFree(d_greyImage__));
    d_greyImage__ = 0;
}
// Entry point: loads an image, converts it to greyscale on the GPU and writes
// the result to disk.
//
// Usage: program [input_image [output_image]]
// When the arguments are omitted, the built-in default paths below are used.
int main(int argc, char* argv[]) {
    // Default input path; overridden by the first command-line argument.
    std::string input_file = "E:/code/study_cuda/study_reduce/study_reduce/cinque_terre_small.jpg";
    // Default output path; overridden by the second command-line argument.
    std::string output_file = "E:/code/study_cuda/study_reduce/study_reduce/cinque_terre_small_togray.jpg";
    if (argc > 1) {
        input_file = argv[1];
    }
    if (argc > 2) {
        output_file = argv[2];
    }

    // RGBA input: host pointer (h_) and device pointer (d_).
    uchar4 *h_rgbaImage = 0, *d_rgbaImage = 0;
    // Greyscale output: host pointer (h_) and device pointer (d_).
    unsigned char *h_greyImage = 0, *d_greyImage = 0;

    // Load the image, allocate the device buffers and upload the RGBA pixels
    // to d_rgbaImage; d_greyImage is zero-filled.
    preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

    // Launch configuration: a 1D grid of 16x16 blocks, one thread per pixel.
    // The block count is the ceiling of numPixels / threadsPerBlock, so the
    // final partial block is launched as well; the kernel's bounds check
    // discards the surplus threads.
    const size_t numPixels = numRows()*numCols();
    const int thread = 16;
    const size_t threadsPerBlock = static_cast<size_t>(thread) * thread;
    const unsigned int grid =
        static_cast<unsigned int>((numPixels + threadsPerBlock - 1) / threadsPerBlock);
    const dim3 blockSize(thread, thread);
    const dim3 gridSize(grid);
    rgba_to_greyscale <<<gridSize, blockSize >>> (d_rgbaImage, d_greyImage,
                                                  static_cast<int>(numRows()),
                                                  static_cast<int>(numCols()));
    // A kernel launch returns no status; launch-configuration errors are
    // reported through cudaGetLastError().
    checkCudaErrors(cudaGetLastError());
    // Wait for the kernel to finish; errors raised during execution surface here.
    checkCudaErrors(cudaDeviceSynchronize());

    // Copy the greyscale result from the GPU back to the host image buffer.
    checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char)*numPixels, cudaMemcpyDeviceToHost));

    // Write the image to disk.
    postProcess(output_file, h_greyImage);

    // Release the device buffers.
    cleanup();
    return 0;
}
// Note: this file is only a record of a learning exercise.