#include <stdio.h>
#include <device_launch_parameters.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cassert>
#include <time.h>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <cuda_runtime.h>
using namespace cv;
void NL_mean_cpu(Mat src, Mat& dst, double h, int halfKernelSize, int halfSearchSize)
{
Mat boardSrc;
dst.create(src.rows, src.cols, CV_8UC1);
int boardSize = halfKernelSize + halfSearchSize;
copyMakeBorder(src, boardSrc, boardSize, boardSize, boardSize, boardSize, BORDER_REFLECT); //边界扩展
double h2 = h * h;
int rows = src.rows; // 图像的宽度
int cols = src.cols; // 图像的高度
for (int j = boardSize; j < boardSize + rows; j++)
{
for (int i = boardSize; i < boardSize + cols; i++)
{
double sumw = 0;
double p = 0;
for (int sr = -halfSearchSize; sr <= halfSearchSize; sr++)
{
for (int sc = -halfSearchSize; sc <= halfSearchSize; sc++)
{
double sum = 0.0;
for (int jj = -halfKernelSize; jj <= halfKernelSize; jj++)
{
for (int ii = -halfKernelSize; ii <= halfKernelSize; ii++)
{
/*uchar dataA = boardSrc.at<uchar>(j + jj, i + ii);
uchar dataB = boardSrc.at<uchar>(j + sr + jj, i + sc + ii);*/
uchar dataA = boardSrc.data[(j + jj) * boardSrc.cols + ii + i];
uchar dataB = boardSrc.data[(j + jj + sr) * boardSrc.cols + sc + ii + i];
int diff = dataA - dataB;
sum += diff * diff;
}
}
sum = sum / ((2 * halfKernelSize + 1) * (2 * halfKernelSize + 1));
double d2 = sum;
double w = exp(-d2 / h2); // 权重计算
/*p += boardSrc.at<uchar>(j + sr, i + sc) * w;*/
p += boardSrc.data[(j + sr) * (boardSrc.cols) + i + sc] * w;
sumw += w;
}
}
dst.data[(j - boardSize)*cols +i - boardSize] = saturate_cast<uchar>(p / sumw);
}
}
}
__global__ void NL_mean_kernel_gpu(const uchar* boardSrc, uchar* dst,
int rows, int cols,
double h, int halfKernelSize,
int halfSearchSize) {
int j = blockIdx.y * blockDim.y + threadIdx.y; // Vertical index
int i = blockIdx.x * blockDim.x + threadIdx.x; // Horizontal index
if (j < rows && i < cols) {
int boardHeight = 2*(halfKernelSize + halfSearchSize) + rows; // Extended height
int boardWidth = 2*(halfKernelSize + halfSearchSize) + cols; // Extended width
int indexA_i = i + halfKernelSize + halfSearchSize; // Index for patch A in extended boardSrc
int indexA_j = j + halfKernelSize + halfSearchSize; // Index for patch A in extended boardSrc
double weight = 0;
double p = 0;
double sumw = 0;
double h2 = h * h;
for (int sr = -halfSearchSize; sr <= halfSearchSize; sr++) { // Vertical search range
for (int sc = -halfSearchSize; sc <= halfSearchSize; sc++) { // Horizontal search range
// Compute d2
int indexB_i = indexA_i + sc; // Horizontal index for patch B
int indexB_j = indexA_j + sr; // Vertical index for patch B
float d2 = 0.0f;
for (int jj = -halfKernelSize; jj <= halfKernelSize; jj++) {
for (int ii = -halfKernelSize; ii <= halfKernelSize; ii++) {//邻域窗口
int dataA = boardSrc[(indexA_j + jj) * boardWidth + indexA_i + ii];
int dataB = boardSrc[(indexB_j + jj) * boardWidth + indexB_i + ii];
int diff = dataA - dataB;//绝对值
d2 += (double)(diff * diff);//平方
}
}
d2 /= ((2 * halfKernelSize+1 ) * (2 * halfKernelSize+1));
weight = exp(-d2 / h2); // Weight for the current patch B
p += boardSrc[indexB_j * boardWidth + indexB_i] * weight;
sumw += weight;
}
}
dst[j * cols + i] = (uchar)(p / sumw); // Assign filtered value to output
}
}
void NL_mean_gpu(Mat src, Mat& dst, double h, int halfKernelSize, int halfSearchSize) {
Mat boardSrc;
dst.create(src.rows, src.cols, CV_8UC1);//为输出分配空间
int boardSize = halfKernelSize + halfSearchSize;
copyMakeBorder(src, boardSrc, boardSize, boardSize, boardSize, boardSize, BORDER_REFLECT); //边界扩展
int rows = src.rows;//宽
int cols = src.cols;//高
//定义GPU变量
uchar* boardSrc_d;//gpu扩展后的图
cudaMalloc((void**)&boardSrc_d, (boardSrc.rows) * (boardSrc.cols) * sizeof(uchar));
cudaMemcpy(boardSrc_d, boardSrc.data, boardSrc.rows * boardSrc.cols * sizeof(uchar), cudaMemcpyHostToDevice);
uchar* dst_d;//最终的结果图
cudaMalloc((void**)&dst_d, (src.rows) * (src.cols) * sizeof(uchar));
//分块
dim3 block(16, 16);
dim3 grid((src.cols + block.x - 1) / block.x, (src.rows + block.y - 1) / block.y);
NL_mean_kernel_gpu<<<grid,block>>>(boardSrc_d,dst_d,src.rows,src.cols,h,halfKernelSize,halfSearchSize);
cudaMemcpy(dst.data, dst_d, (src.rows) * (src.cols) * sizeof(uchar), cudaMemcpyDeviceToHost);
}
void nlmean_test(void)
{
Mat img = imread("C:\\Users\\ASUS\\Desktop\\高性能大作业\\lena_256.png", cv::IMREAD_GRAYSCALE);
// 计算噪声像素数量
int noisePercentage = 10;
int noisePixels = 0.1 * img.rows * img.cols;
// 在随机位置添加噪声
for (int i = 0; i < noisePixels; ++i) {
int row = rand() % img.rows;
int col = rand() % img.cols;
int noiseValue = rand() % 256; // 随机生成0到255之间的噪声值
img.at<uchar>(row, col) = noiseValue; // 设置噪声像素的灰度值
}
imshow("加噪图片",img);
Mat out_gpu;
Mat out_cpu;
// 使用计时器开始测量时间
double start = static_cast<double>(cv::getTickCount());
// 调用NL_mean_gpu函数
NL_mean_gpu(img, out_gpu, 20, 3, 15);
// 使用计时器停止测量时间
double end = static_cast<double>(cv::getTickCount());
// 计算运行时间(以秒为单位)
double elapsedTime = (end - start) / cv::getTickFrequency();
// 打印运行时间
std::cout << "NL_mean_gpu函数的运行时间: " << elapsedTime << " 秒" << std::endl;
imshow("非局部均值滤波_gpu", out_gpu);
// 使用计时器开始测量时间
double start_ = static_cast<double>(cv::getTickCount());
// 调用NL_mean_cpu函数
NL_mean_cpu(img, out_cpu, 20, 3, 15);
// 使用计时器停止测量时间
double end_ = static_cast<double>(cv::getTickCount());
// 计算运行时间(以秒为单位)
elapsedTime = (end_ - start_) / cv::getTickFrequency();
// 打印运行时间
std::cout << "NL_mean_cpu函数的运行时间: " << elapsedTime << " 秒" << std::endl;
imshow("非局部均值滤波_cpu", out_cpu);
waitKey();
}
int main()
{
nlmean_test();
return 0;
}
这是西电的高性能计算实验大作业,学弟学妹可以私聊我要报告,代码可以直接运行