5.2 Computing Derivatives on a One-Dimensional Grid
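The example approximates the second derivative of u(x) = sin(x) on a uniform grid of spacing h with the central-difference stencil u''_i ≈ (u[i-1] - 2*u[i] + u[i+1]) / h^2, which is exactly what the kernel below computes. Since (sin x)'' = -sin x, the last column written to result.csv, result_parallel[i] + u[i], should be close to zero at every interior point. The code is split into a header (kernel.h), the kernel plus its host wrapper (kernel.cu), and the host program (main.cpp).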

kernel.h

#pragma once
#ifndef KERNEL_H
#define KERNEL_H

void ddParallel(float *out, const float *in, int n, float h);

#endif // !KERNEL_H

kernel.cu

#include "kernel.h"
#define TPB 64

__global__ void ddKernel(float *d_out, const float *d_in, int size, float h)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x;  // global index of this thread
	if (i >= size) return;
	// Central-difference approximation of the second derivative.
	// Note: at the endpoints i = 0 and i = size-1 the stencil reads outside the array,
	// which is why main.cpp only uses the interior points.
	d_out[i] = (d_in[i - 1] - 2.f*d_in[i] + d_in[i + 1]) / (h*h);

}

void ddParallel(float *out, const float *in, int n, float h)
{
	float *d_in = 0, *d_out = 0;

	cudaMalloc(&d_in, n * sizeof(float));
	cudaMalloc(&d_out, n * sizeof(float));
	cudaMemcpy(d_in, in, n * sizeof(float), cudaMemcpyHostToDevice);

	ddKernel<<<(n + TPB - 1) / TPB, TPB>>>(d_out, d_in, n, h);
	cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaFree(d_in);
	cudaFree(d_out);
}
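The wrapper above launches the kernel without checking whether the launch or the kernel itself succeeded. A minimal error-checking sketch around the launch (using the standard CUDA runtime calls cudaGetLastError and cudaDeviceSynchronize; printf assumes <stdio.h> is included):

	ddKernel<<<(n + TPB - 1) / TPB, TPB>>>(d_out, d_in, n, h);
	cudaError_t err = cudaGetLastError();    // reports errors from the launch itself
	if (err != cudaSuccess)
		printf("ddKernel launch failed: %s\n", cudaGetErrorString(err));
	err = cudaDeviceSynchronize();           // reports errors raised while the kernel ran
	if (err != cudaSuccess)
		printf("ddKernel execution failed: %s\n", cudaGetErrorString(err));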

main.cpp

#include "kernel.h"
#include <math.h>
#include <stdio.h>

int main()
{
	const float PI = 3.1415927;
	const int N = 150;
	const float h = 2 * PI / N;

	float x[N] = { 0.0 };
	float u[N] = { 0.0 };
	float result_parallel[N] = { 0.0 };

	for (int i = 0; i<N; i++)
	{
		x[i] = 2 * PI*i / N;
		u[i] = sinf(x[i]);
	}

	ddParallel(result_parallel, u, N, h);

	FILE *outfile = fopen("result.csv", "w");
	for (int i = 1; i < N-1; ++i)
	{
		fprintf(outfile, "%f,%f,%f,%f\n", x[i], u[i], result_parallel[i], result_parallel[i] + u[i]);
	}
	fclose(outfile);

	return 0;
}
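As a quick sanity check (not part of the original post), the numerical result can be compared against the analytical identity (sin x)'' = -sin x inside main before the file is written, for example by recording the largest value of result_parallel[i] + u[i] over the interior points:

	// Hypothetical check: u'' + sin(x) should be close to zero at the interior points.
	float maxErr = 0.0f;
	for (int i = 1; i < N - 1; ++i)
	{
		const float err = fabsf(result_parallel[i] + u[i]);
		if (err > maxErr) maxErr = err;
	}
	printf("max |u'' + sin(x)| over interior points: %f\n", maxErr);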

Optimization with shared memory (only kernel.cu is partially modified; kernel.h and main.cpp are unchanged). In the global-memory version every input value is read by up to three different threads; by first loading a tile of the input, plus one halo cell at each end, into shared memory, each block fetches every value from global memory only once and the stencil then reads from the much faster on-chip shared memory.


#include "kernel.h"
#define TPB 64
#define RAD 1  // stencil radius: one halo cell on each side of the block

__global__ void ddKernel(float *d_out, const float *d_in, int size, float h)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x;  // global index of this thread
	if (i >= size) return;

	const int s_idx = threadIdx.x + RAD;  // local (shared-memory) index derived from the thread index
	extern __shared__ float s_in[];       // dynamically allocated shared memory (size set at launch)

	s_in[s_idx] = d_in[i];  // each thread loads its own element into the tile

	if (threadIdx.x < RAD)  // the first RAD threads also load the halo cells at both ends of the tile
	{
		s_in[s_idx - RAD] = d_in[i - RAD];
		s_in[s_idx + blockDim.x] = d_in[i + blockDim.x];
	}
	__syncthreads();  // barrier: all threads in the block must finish loading before any thread reads the tile
	d_out[i] = (s_in[s_idx - 1] - 2.f*s_in[s_idx] + s_in[s_idx + 1]) / (h*h);
	// Global-memory version, for comparison:
	// d_out[i] = (d_in[i - 1] - 2.f*d_in[i] + d_in[i + 1]) / (h*h);

}

void ddParallel(float *out, const float *in, int n, float h)
{
	float *d_in = 0, *d_out = 0;

	cudaMalloc(&d_in, n * sizeof(float));
	cudaMalloc(&d_out, n * sizeof(float));
	cudaMemcpy(d_in, in, n * sizeof(float), cudaMemcpyHostToDevice);

	const size_t smemSize = (TPB + 2 * RAD) * sizeof(float);  // changed: tile size is the block width plus RAD halo cells on each side

	ddKernel<<<(n + TPB - 1) / TPB, TPB, smemSize>>>(d_out, d_in, n, h);  // changed: third launch argument gives the dynamic shared-memory size in bytes
	cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaFree(d_in);
	cudaFree(d_out);
}
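Because TPB and RAD are compile-time constants here, the same tile could also be declared with a static size inside the kernel, in which case the third launch argument is not needed. A minimal sketch of that alternative (not from the original post):

__global__ void ddKernelStatic(float *d_out, const float *d_in, int size, float h)
{
	const int i = blockIdx.x*blockDim.x + threadIdx.x;
	if (i >= size) return;

	__shared__ float s_in[TPB + 2 * RAD];  // statically sized tile: block width plus halo
	const int s_idx = threadIdx.x + RAD;

	s_in[s_idx] = d_in[i];
	if (threadIdx.x < RAD)                 // first RAD threads also fill the halo cells
	{
		s_in[s_idx - RAD] = d_in[i - RAD];
		s_in[s_idx + blockDim.x] = d_in[i + blockDim.x];
	}
	__syncthreads();
	d_out[i] = (s_in[s_idx - 1] - 2.f*s_in[s_idx] + s_in[s_idx + 1]) / (h*h);
}

// Launched without the dynamic shared-memory argument:
// ddKernelStatic<<<(n + TPB - 1) / TPB, TPB>>>(d_out, d_in, n, h);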

 
