采用CUDA Thrust实现的标准二体模型

最新推荐文章于 2024-07-10 16:21:46 发布

eyepeak

最新推荐文章于 2024-07-10 16:21:46 发布

阅读量1.6k

点赞数 1

文章标签： cuda 算法 c++

本文链接：https://blog.csdn.net/wangrongfeng/article/details/122315445

版权

采用CUDA Thrust实现的标准二体模型

基于CUDA的空间目标轨道并行计算技术，共四节，其中第一、二、三节目录如下

1 基于CUDA的空间目标轨道计算需求与任务分析

2 基于CUDA的空间坐标系变换矩阵计算
2.1 基于SOFA的空间坐标系变换
2.2 采用CUDA Thrust实现的空间坐标系变换
2.3 采用CUDA Runtime实现的空间坐标系变换

3 基于CUDA的标准二体模型轨道预推算法
3.1 空间目标二体运动模型及其CPU实现
3.2 采用CUDA Thrust实现的标准二体模型
3.3 采用CUDA Runtime实现的标准二体模型

一、直接实现及其效率分析

采用Thrust实现二体模型，首先采用的是直接实现。Thrust仿函数定义如下

// Thrust functor that evaluates one sample of a two-body (Keplerian) orbit.
//
// For sample index i the elapsed time since the ephemeris epoch is
//   _stepMilliSecond * i  (milliseconds)  +  _jdOfBt2Epo  (days).
// The functor advances the mean anomaly, solves Kepler's equation for the
// eccentric anomaly, derives the true anomaly and orbital radius, rotates the
// position with the locally built matrix rmat, and finally rotates it again
// with the per-sample matrix c2tmat into pv._ECFp.
//
// The published listing is abbreviated (velocity vectors and some declarations
// are omitted); the notes marked NOTE(review) flag the places where the elided
// code must be checked against the real project sources.
struct twoBodyModelTransFunctor_Thrust_1{
	double		_jdOfBt2Epo;	// start time minus ephemeris epoch, in days
	OrbitElement _orbitElement;	// orbital elements; angular elements are in degrees
	int		_stepMilliSecond;	// sampling step, in milliseconds
	// i       : sample index (multiplies the step)
	// c2tmat  : rotation applied to the rmat-rotated position to obtain pv._ECFp
	// returns : pvOfOrbit holding the computed vectors for this sample
	__device__ pvOfOrbit operator()(int& i, mat3x3& c2tmat)const {
		// The struct has no _semimajorAxis/_eccentricity/_meanAnomaly members of
		// its own; they are read from _orbitElement, like _eccentricity is in the
		// Kepler iteration of the original listing.
		// NOTE(review): confirm these field names against OrbitElement.
		const double a = _orbitElement._semimajorAxis;
		const double e = _orbitElement._eccentricity;

		// Mean motion n = sqrt(mu / a^3) and the corresponding period 2*pi/n.
		// It is multiplied by a time in seconds below, so it is treated as rad/s.
		const double meanMotion = sqrt(MIU_EARTH / a) / a;
		const double timeOfCycle = PI * 2 / meanMotion;

		// Rotation built from RAAN, inclination and argument of perigee (degrees -> radians).
		// NOTE(review): SOFA's iauRz/iauRx pre-multiply the matrix, so the call
		// order determines the composition order — verify it matches the intended
		// orbit-frame-to-inertial rotation.
		double rmat[3][3];
		iauIr(rmat);
		iauRz(-_orbitElement._RAAN*PI / 180.0, rmat);
		iauRx(-_orbitElement._inclination*PI / 180.0, rmat);
		iauRz(-_orbitElement._argumentOfPerigee*PI / 180.0, rmat);

		// Elapsed time since epoch. Convert days -> seconds BEFORE reducing modulo
		// the period: the original reduced the day-valued time by a period in
		// seconds, which made the reduction a no-op.
		double	t = ((double)_stepMilliSecond*i / 86400.0 / 1000.0 + _jdOfBt2Epo) * 86400;
		t = fmod(t, timeOfCycle);

		// Mean anomaly advance since epoch, plus the initial mean anomaly.
		double M = t * meanMotion;
		M = fmod(M, PI * 2);
		double	radM = _orbitElement._meanAnomaly*PI / 180 + M;

		// Solve Kepler's equation E = M + e*sin(E) by fixed-point iteration.
		// The loop is bounded so that a NaN input or e >= 1 cannot hang the GPU
		// thread; the negated comparison also exits immediately on NaN.
		const int kMaxKeplerIterations = 100000;
		double E0 = radM;
		double E1 = radM;
		for (int iter = 0; iter < kMaxKeplerIterations; ++iter){
			E1 = radM + e * sin(E0);
			if (!(fabs(E1 - E0) >= 1e-10))break;
			E0 = E1;
		}
		E1 = fmod(E1, PI * 2);

		// True anomaly from the eccentric anomaly, then radius from the conic equation.
		double f = 2 * atan((sqrt((1 + e) / (1 - e))) *tan(E1 / 2));
		double  tmpcosf = cos(f);
		double r = a*(1 - e*e) / (1 + e*tmpcosf);

		// Specific angular momentum and the velocity components derived from it.
		// They are unused here because the listing omits the velocity vectors.
		double h = sqrt(MIU_EARTH*(r + r * e*tmpcosf));
		double vt = h / r;   // transverse velocity component
		double vr = MIU_EARTH / h * e*sin(f); // radial velocity component
		(void)vt;
		(void)vr;

		// NOTE(review): pt is (r, 0, 0) and rmat does not contain the true
		// anomaly f; confirm that the elided code accounts for f.
		double pt[3];
		pt[0] = r, pt[1] = 0.0, pt[2] = 0.0;

		// The listing used an undeclared `mat`; the matrix built above is rmat.
		// ECIp and pv were also undeclared in the listing and are declared here.
		// NOTE(review): ECIp is not copied into pv in this abbreviated version.
		double ECIp[3];
		pvOfOrbit pv;
		iauRxp(rmat, pt, ECIp);
		iauRxp(c2tmat._mat, ECIp, pv._ECFp._vec);
		return pv;};};

函数的输入是序号和变换矩阵，这两者都是预先生成的数组，并在调用thrust::transform时作为参数传递；每个线程会得到与其对应的一组数据。输出为计算结果数据结构，包括惯性系和地固系下的位置矢量、速度矢量，共4个矢量（上述代码省略了速度矢量的计算部分）。调用仿函数的代码在此省略。

其效率如下表所示

采样点数	CPU版本耗时（s）	Thrust版本耗时（s）
14400	0.011	0.002
86400	0.099	0.016
864000	0.586	0.098
864000*5	2.798	0.509

CUDA版本耗时不到CPU版本的20%。其效率有所提升，但并没有空间坐标系变换那么明显。

二、优化

下面在直接Thrust版本的基础上进行优化。主要优化点是轨道坐标系到惯性坐标系的转换矩阵的计算，此矩阵对于所有的采样点都是一样的，因此在CPU端进行计算，将计算结果直接传递到GPU（Thrust仿函数中的成员变量，应该都是存储在constant 内存的），而在CUDA线程中，不再进行计算。

优化后的仿函数代码为

// Thrust functor for the two-body model with a precomputed rotation matrix
// carried as the member _rmat, instead of a matrix built inside operator().
// The listing is abbreviated: everything after the time reduction is replaced
// by a prose placeholder in the article.
struct twoBodyModelTransFunctor_Thrust_3{
	double		_jdOfBt2Epo;	// offset between the start time and the ephemeris epoch (added to a value in days below)
	OrbitElement _orbitElement;	// orbital elements of the target
	int		_stepMilliSecond;	// sampling step in milliseconds (divided by 86400*1000 below)
	double	_rmat[3][3];	// rotation matrix supplied by the caller before the functor is used
	// i      : sample index, multiplied by the step to get the elapsed time
	// c2tmat : per-sample matrix; not used in the visible part of this listing
	__device__ pvOfOrbit operator()(int& i, mat3x3& c2tmat)const {
		// 2*pi divided by sqrt(MIU_EARTH/a)/a, i.e. 2*pi / sqrt(MIU_EARTH/a^3).
		// NOTE(review): _semimajorAxis is not a member of this struct as shown;
		// presumably _orbitElement._semimajorAxis is meant — confirm.
		double timeOfCycle=PI*2/(sqrt(MIU_EARTH /_semimajorAxis)/_semimajorAxis);
		// Elapsed time: step*i converted from milliseconds to days, plus the epoch offset.
		double	t = (double)_stepMilliSecond*i / 86400.0 / 1000.0 + _jdOfBt2Epo;
		// NOTE(review): t is in days here, while timeOfCycle looks like seconds
		// (assuming MIU_EARTH is expressed per second^2) — the two operands of
		// fmod appear to use different units; verify.
		t = fmod(t, timeOfCycle);
		// The next line is the article's placeholder ("same as the previous
		// listing"); it is not compilable code and stands for the elided body.
		与上一段代码相同;};};

代码与上一版代码的区别就在于多了一个变换矩阵_rmat[3][3]，即轨道坐标系到惯性坐标系的变换矩阵，不再是每个线程计算了。

与之对应的是调用仿函数的代码中，需要完成轨道坐标系到惯性坐标系变换矩阵的计算。调用仿函数的代码如下。

// Host-side driver: fills a twoBodyModelTransFunctor_Thrust_3 and runs it over
// every sample with thrust::transform.
//
// eo     : orbital elements (RAAN, inclination, argument of perigee in degrees)
// ept    : ephemeris epoch (calendar date and time of day)
// t0     : start time of the propagation (calendar date and time of day)
// step   : sampling step in milliseconds
// d_mats : one matrix per sample, already on the device; its size defines the
//          number of samples
// d_pvs  : output; resized to d_mats.size() and overwritten
//
// Returns 0 on success. If iauCal2jd rejects a year or month (status -1 / -2,
// in which case it does not produce a Julian Date) that status is returned and
// nothing is launched.
int		propagateOrbitWithTwoBody_Thrust_3(OrbitElement eo, const timeOfSpace& ept, const timeOfSpace& t0, int step, device_vector<mat3x3>& d_mats, device_vector<pvOfOrbit>& d_pvs)	{
	twoBodyModelTransFunctor_Thrust_3 functor;

	// Two-part Julian Dates of the start time (jd10, jd11) and of the epoch (jd00, jd01).
	double jd00, jd01, jd10, jd11;
	int status = iauCal2jd(t0._year, t0._month, t0._day, &jd10, &jd11);
	if (status == -1 || status == -2) return status;
	status = iauCal2jd(ept._year, ept._month, ept._day, &jd00, &jd01);
	if (status == -1 || status == -2) return status;

	// Time of day as a fraction of a day. The numerator is in milliseconds, so
	// the divisor must be the number of milliseconds per day (the original
	// divided by 86400, giving a value 1000 times too large).
	const double msPerDay = 86400.0 * 1000.0;
	const double t0DayFraction = double((60 * (60 * t0._hour + t0._minute) + t0._second) * 1000 + t0._millisecond) / msPerDay;
	const double eptDayFraction = double((60 * (60 * ept._hour + ept._minute) + ept._second) * 1000 + ept._millisecond) / msPerDay;

	// Start time minus epoch, in days. Subtracting part by part keeps the
	// precision that would be lost by first summing each two-part Julian Date.
	functor._jdOfBt2Epo = (jd10 - jd00) + (jd11 - jd01) + (t0DayFraction - eptDayFraction);

	// The rotation is identical for every sample, so it is built once on the
	// host, directly in the functor member that is copied to the device.
	// NOTE(review): SOFA's iauRz/iauRx pre-multiply the matrix, so the call
	// order determines the composition order — verify it matches the intended
	// orbit-frame-to-inertial rotation.
	iauIr(functor._rmat);
	iauRz(-eo._RAAN*PI / 180.0, functor._rmat);
	iauRx(-eo._inclination*PI / 180.0, functor._rmat);
	iauRz(-eo._argumentOfPerigee*PI / 180.0, functor._rmat);

	functor._orbitElement = eo;
	// The parameter is named `step`; the original assigned from an undeclared name.
	functor._stepMilliSecond = step;

	// One output per input matrix; sample indices 0, 1, 2, ... are generated on
	// the device and paired element-wise with the matrices.
	d_pvs.resize(d_mats.size());
	thrust::device_vector<int> d_tms(d_mats.size());
	thrust::sequence(d_tms.begin(), d_tms.end(), 0, 1);
	thrust::transform(d_tms.begin(), d_tms.end(), d_mats.begin(), d_pvs.begin(), functor);
	return 0;}