5.2 Computing Derivatives on a One-Dimensional Grid
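The example approximates the second derivative of u(x) = sin(x) on a uniform grid of spacing h with the central-difference stencil u''_i ≈ (u[i-1] - 2*u[i] + u[i+1]) / h^2, which is exactly what the kernel below computes. Since (sin x)'' = -sin x, the last column written to result.csv, result_parallel[i] + u[i], should be close to zero at every interior point. The code is split into a header (kernel.h), the kernel plus its host wrapper (kernel.cu), and the host program (main.cpp).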

kernel.h

#pragma once
#ifndef KERNEL_H
#define KERNEL_H

void ddParallel(float *out, const float *in, int n, float h);

#endif // !KERNEL_H

kernel.cu

#include "kernel.h"
#define TPB 64

__global__ void ddKernel(float *d_out, const float *d_in, int size, float h)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x;  // global index of this thread
	if (i >= size) return;
	// Central-difference approximation of the second derivative.
	// Note: at the endpoints i = 0 and i = size-1 the stencil reads outside the array,
	// which is why main.cpp only uses the interior points.
	d_out[i] = (d_in[i - 1] - 2.f*d_in[i] + d_in[i + 1]) / (h*h);

}

void ddParallel(float *out, const float *in, int n, float h)
{
	float *d_in = 0, *d_out = 0;

	cudaMalloc(&d_in, n * sizeof(float));
	cudaMalloc(&d_out, n * sizeof(float));
	cudaMemcpy(d_in, in, n * sizeof(float), cudaMemcpyHostToDevice);

	ddKernel<<<(n + TPB - 1) / TPB, TPB>>>(d_out, d_in, n, h);
	cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaFree(d_in);
	cudaFree(d_out);
}
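The wrapper above launches the kernel without checking whether the launch or the kernel itself succeeded. A minimal error-checking sketch around the launch (using the standard CUDA runtime calls cudaGetLastError and cudaDeviceSynchronize; printf assumes <stdio.h> is included):

	ddKernel<<<(n + TPB - 1) / TPB, TPB>>>(d_out, d_in, n, h);
	cudaError_t err = cudaGetLastError();    // reports errors from the launch itself
	if (err != cudaSuccess)
		printf("ddKernel launch failed: %s\n", cudaGetErrorString(err));
	err = cudaDeviceSynchronize();           // reports errors raised while the kernel ran
	if (err != cudaSuccess)
		printf("ddKernel execution failed: %s\n", cudaGetErrorString(err));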

main.cpp

#include "kernel.h"
#include <math.h>
#include <stdio.h>

int main()
{
	const float PI = 3.1415927;
	const int N = 150;
	const float h = 2 * PI / N;

	float x[N] = { 0.0 };
	float u[N] = { 0.0 };
	float result_parallel[N] = { 0.0 };

	for (int i = 0; i<N; i++)
	{
		x[i] = 2 * PI*i / N;
		u[i] = sinf(x[i]);
	}

	ddParallel(result_parallel, u, N, h);

	FILE *outfile = fopen("result.csv", "w");
	for (int i = 1; i < N-1; ++i)
	{
		fprintf(outfile, "%f,%f,%f,%f\n", x[i], u[i], result_parallel[i], result_parallel[i] + u[i]);
	}
	fclose(outfile);

	return 0;
}
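As a quick sanity check (not part of the original post), the numerical result can be compared against the analytical identity (sin x)'' = -sin x inside main before the file is written, for example by recording the largest value of result_parallel[i] + u[i] over the interior points:

	// Hypothetical check: u'' + sin(x) should be close to zero at the interior points.
	float maxErr = 0.0f;
	for (int i = 1; i < N - 1; ++i)
	{
		const float err = fabsf(result_parallel[i] + u[i]);
		if (err > maxErr) maxErr = err;
	}
	printf("max |u'' + sin(x)| over interior points: %f\n", maxErr);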

Optimization with shared memory (only kernel.cu is partially modified; kernel.h and main.cpp are unchanged). In the global-memory version every input value is read by up to three different threads; by first loading a tile of the input, plus one halo cell at each end, into shared memory, each block fetches every value from global memory only once and the stencil then reads from the much faster on-chip shared memory.


#include "kernel.h"
#define TPB 64
#define RAD 1  // stencil radius: one halo cell on each side of the block

__global__ void ddKernel(float *d_out, const float *d_in, int size, float h)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x;  // global index of this thread
	if (i >= size) return;

	const int s_idx = threadIdx.x + RAD;  // local (shared-memory) index derived from the thread index
	extern __shared__ float s_in[];       // dynamically allocated shared memory (size set at launch)

	s_in[s_idx] = d_in[i];  // each thread loads its own element into the tile

	if (threadIdx.x < RAD)  // the first RAD threads also load the halo cells at both ends of the tile
	{
		s_in[s_idx - RAD] = d_in[i - RAD];
		s_in[s_idx + blockDim.x] = d_in[i + blockDim.x];
	}
	__syncthreads();  // barrier: all threads in the block must finish loading before any thread reads the tile
	d_out[i] = (s_in[s_idx - 1] - 2.f*s_in[s_idx] + s_in[s_idx + 1]) / (h*h);
	// Global-memory version, for comparison:
	// d_out[i] = (d_in[i - 1] - 2.f*d_in[i] + d_in[i + 1]) / (h*h);

}

void ddParallel(float *out, const float *in, int n, float h)
{
	float *d_in = 0, *d_out = 0;

	cudaMalloc(&d_in, n * sizeof(float));
	cudaMalloc(&d_out, n * sizeof(float));
	cudaMemcpy(d_in, in, n * sizeof(float), cudaMemcpyHostToDevice);

	const size_t smemSize = (TPB + 2 * RAD) * sizeof(float);  // changed: tile size is the block width plus RAD halo cells on each side

	ddKernel<<<(n + TPB - 1) / TPB, TPB, smemSize>>>(d_out, d_in, n, h);  // changed: third launch argument gives the dynamic shared-memory size in bytes
	cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaFree(d_in);
	cudaFree(d_out);
}
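Because TPB and RAD are compile-time constants here, the same tile could also be declared with a static size inside the kernel, in which case the third launch argument is not needed. A minimal sketch of that alternative (not from the original post):

__global__ void ddKernelStatic(float *d_out, const float *d_in, int size, float h)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i >= size) return;

	__shared__ float s_in[TPB + 2 * RAD];  // statically sized tile: block width plus halo
	const int s_idx = threadIdx.x + RAD;

	s_in[s_idx] = d_in[i];
	if (threadIdx.x < RAD)                 // first RAD threads also fill the halo cells
	{
		s_in[s_idx - RAD] = d_in[i - RAD];
		s_in[s_idx + blockDim.x] = d_in[i + blockDim.x];
	}
	__syncthreads();
	d_out[i] = (s_in[s_idx - 1] - 2.f*s_in[s_idx] + s_in[s_idx + 1]) / (h*h);
}

// Launched without the dynamic shared-memory argument:
// ddKernelStatic<<<(n + TPB - 1) / TPB, TPB>>>(d_out, d_in, n, h);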

 
