## NPP
#include <iostream>
#include <opencv2/opencv.hpp>
#include <npp.h>
#include <nppi.h>
/**
 * Loads "input_image.jpg", converts it to 8-bit grayscale on the GPU with
 * NPP, and shows the original and the result in two OpenCV windows.
 *
 * Returns 0 on success and -1 if the image cannot be read or any CUDA/NPP
 * call fails.
 */
int main() {
    // cv::imread with IMREAD_COLOR yields 8-bit, 3-channel pixels stored
    // interleaved in B,G,R order.
    cv::Mat inputImage = cv::imread("input_image.jpg", cv::IMREAD_COLOR);
    if (inputImage.empty()) {
        std::cerr << "Error: Could not read the image file." << std::endl;
        return -1;
    }
    // The single flat cudaMemcpy below requires rows without padding.
    if (!inputImage.isContinuous()) {
        inputImage = inputImage.clone();
    }

    const int width = inputImage.cols;
    const int height = inputImage.rows;
    const int srcStep = width * inputImage.channels();               // bytes per source row
    const int dstStep = width * static_cast<int>(sizeof(Npp8u));     // bytes per gray row
    const size_t srcBytes = static_cast<size_t>(srcStep) * height;
    const size_t dstBytes = static_cast<size_t>(dstStep) * height;

    // Single-channel host image that owns the result buffer (no manual delete[]).
    cv::Mat outputImage(height, width, CV_8UC1);

    // Reports a failed CUDA runtime call; returns true when the call succeeded.
    auto cudaOk = [](cudaError_t status, const char *what) {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    };

    unsigned char *dev_inputImage = nullptr;
    unsigned char *dev_outputImage = nullptr;

    bool ok = cudaOk(cudaMalloc(&dev_inputImage, srcBytes), "cudaMalloc(input)")
           && cudaOk(cudaMalloc(&dev_outputImage, dstBytes), "cudaMalloc(output)")
           && cudaOk(cudaMemcpy(dev_inputImage, inputImage.data, srcBytes,
                                cudaMemcpyHostToDevice),
                     "host-to-device copy");

    if (ok) {
        // Convert to grayscale using NPP. nppiRGBToGray would weight channel 0
        // as red, but the buffer is B,G,R, so apply the same luma weights
        // (0.299 R, 0.587 G, 0.114 B) in B,G,R channel order instead.
        const Npp32f bgrLumaWeights[3] = { 0.114f, 0.587f, 0.299f };
        const NppiSize roiSize = { width, height };
        const NppStatus nppStatus = nppiColorToGray_8u_C3C1R(
            dev_inputImage, srcStep, dev_outputImage, dstStep, roiSize, bgrLumaWeights);
        // Negative NppStatus values are errors; positive ones are warnings.
        if (nppStatus < NPP_SUCCESS) {
            std::cerr << "NPP error during grayscale conversion: "
                      << static_cast<int>(nppStatus) << std::endl;
            ok = false;
        }
    }

    // Only width * height bytes come back: the result has one channel.
    ok = ok && cudaOk(cudaMemcpy(outputImage.data, dev_outputImage, dstBytes,
                                 cudaMemcpyDeviceToHost),
                      "device-to-host copy");

    // Release device memory on every path; cudaFree(nullptr) is a no-op.
    cudaFree(dev_inputImage);
    cudaFree(dev_outputImage);

    if (!ok) {
        return -1;
    }

    cv::imshow("Input Image", inputImage);
    cv::imshow("Output Image (Grayscale)", outputImage);
    cv::waitKey(0);
    return 0;
}
## Kernel function (核函数)
#include <iostream>
#include <opencv2/opencv.hpp>
/**
 * Kernel: converts an interleaved 8-bit, 3-channel image to 8-bit grayscale.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel. The grid may
 * overshoot the image; threads outside width x height do nothing.
 *
 * @param inputImage  Device buffer of width * height * 3 bytes, rows tightly
 *                    packed, channels in B,G,R order (the layout produced by
 *                    cv::imread with IMREAD_COLOR, which is what the caller
 *                    in this file uploads).
 * @param outputImage Device buffer of width * height bytes receiving the
 *                    luma value of each pixel.
 * @param width       Image width in pixels.
 * @param height      Image height in pixels.
 */
__global__ void grayscale(unsigned char *inputImage, unsigned char *outputImage, int width, int height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) {
        return;
    }

    // 64-bit index: y * width and 3 * index can exceed INT_MAX on large images.
    const size_t pixel = static_cast<size_t>(y) * width + x;
    const unsigned char *bgr = inputImage + 3 * pixel;

    // Luma = 0.299 R + 0.587 G + 0.114 B, with channel 0 = B and channel 2 = R.
    const float luma = 0.114f * bgr[0] + 0.587f * bgr[1] + 0.299f * bgr[2];

    // Round to nearest instead of truncating; luma + 0.5 stays below 256.
    outputImage[pixel] = static_cast<unsigned char>(luma + 0.5f);
}
/**
 * Loads "input_image.jpg", converts it to 8-bit grayscale on the GPU with the
 * grayscale kernel, and shows the original and the result in two OpenCV
 * windows.
 *
 * Returns 0 on success and -1 if the image cannot be read or any CUDA call
 * (allocation, copy, kernel launch) fails.
 */
int main() {
    cv::Mat inputImage = cv::imread("input_image.jpg", cv::IMREAD_COLOR);
    if (inputImage.empty()) {
        std::cerr << "Error: Could not read the image file." << std::endl;
        return -1;
    }
    // The kernel indexes pixels as a flat, tightly packed array, so the
    // host buffer must not contain row padding.
    if (!inputImage.isContinuous()) {
        inputImage = inputImage.clone();
    }

    const int width = inputImage.cols;
    const int height = inputImage.rows;
    const size_t pixelCount = static_cast<size_t>(width) * height;
    const size_t inputBytes = pixelCount * inputImage.channels();
    const size_t outputBytes = pixelCount;  // one byte per pixel in the result

    // Single-channel host image that owns the result buffer (no manual delete[]).
    cv::Mat outputImage(height, width, CV_8UC1);

    // Reports a failed CUDA runtime call; returns true when the call succeeded.
    auto cudaOk = [](cudaError_t status, const char *what) {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    };

    unsigned char *dev_inputImage = nullptr;
    unsigned char *dev_outputImage = nullptr;

    /* Allocate device memory and copy the host pixels to the device. */
    bool ok = cudaOk(cudaMalloc(&dev_inputImage, inputBytes), "cudaMalloc(input)")
           && cudaOk(cudaMalloc(&dev_outputImage, outputBytes), "cudaMalloc(output)")
           && cudaOk(cudaMemcpy(dev_inputImage, inputImage.data, inputBytes,
                                cudaMemcpyHostToDevice),
                     "host-to-device copy");

    if (ok) {
        /* Configure the launch (16x16 threads per block, ceil-div grid) and run the kernel. */
        const dim3 threadsPerBlock(16, 16);
        const dim3 numBlocks((width + threadsPerBlock.x - 1) / threadsPerBlock.x,
                             (height + threadsPerBlock.y - 1) / threadsPerBlock.y);
        grayscale<<<numBlocks, threadsPerBlock>>>(dev_inputImage, dev_outputImage, width, height);

        /* A launch returns no status itself; query it, then copy the result back.
           The blocking cudaMemcpy also surfaces errors raised while the kernel ran. */
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaMemcpy(outputImage.data, dev_outputImage, outputBytes,
                               cudaMemcpyDeviceToHost),
                    "device-to-host copy");
    }

    /* Release device memory on every path; cudaFree(nullptr) is a no-op. */
    cudaFree(dev_inputImage);
    cudaFree(dev_outputImage);

    if (!ok) {
        return -1;
    }

    cv::imshow("Input Image", inputImage);
    cv::imshow("Output Image (Grayscale)", outputImage);
    cv::waitKey(0);
    return 0;
}