以前，我实现了这样的代码：
// This implementation follows the code from
// https://github.com/erwincoumans/experiments/blob/master/opencl/primitives/AdlPrimitives/Math/MathCL.h
#ifndef UNIFIED_MATH_CUDA_H
#define UNIFIED_MATH_CUDA_H
#include "vector_functions.h"
/*****************************************
Vector
*****************************************/
// Divides numerator by denominator using the __fdividef fast-math intrinsic,
// which trades some accuracy for speed compared with the plain `/` operator.
//
// numerator   - dividend.
// denominator - divisor.
// Returns whatever __fdividef(numerator, denominator) yields.
__device__
float fastDiv(float numerator, float denominator)
{
    const float quotient = __fdividef(numerator, denominator);
    return quotient;
}
// Single-precision square root.
//
// f2 - value to take the root of.
// Returns sqrtf(f2).
__device__
float getSqrtf(float f2)
{
    const float root = sqrtf(f2);
    return root;
}
// Single-precision reciprocal square root, i.e. 1 / sqrt(f2), obtained from
// the rsqrtf math function.
//
// f2 - value whose reciprocal root is wanted.
// Returns rsqrtf(f2).
__device__
float getReverseSqrt(float f2)
{
    const float invRoot = rsqrtf(f2);
    return invRoot;
}
// Cross product a x b of two 3-component vectors.
//
// Each output component is written out separately so the formula can be read
// one axis at a time.
__device__
float3 getCrossProduct(float3 a, float3 b)
{
    const float cx = a.y*b.z - a.z*b.y;
    const float cy = a.z*b.x - a.x*b.z;
    const float cz = a.x*b.y - a.y*b.x;
    return make_float3(cx, cy, cz);
}
// Cross product of the x, y, z parts of two float4 vectors.
//
// The w components of a and b are not read; the w component of the result is
// always 0.0f.
__device__
float4 getCrossProduct(float4 a, float4 b)
{
    return make_float4(a.y*b.z - a.z*b.y,
                       a.z*b.x - a.x*b.z,
                       a.x*b.y - a.y*b.x,
                       0.0f);
}
// Dot product of two 3-component vectors: the per-axis products are summed in
// x, y, z order.
__device__
float getDotProduct(float3 a, float3 b)
{
    const float xx = a.x * b.x;
    const float yy = a.y * b.y;
    const float zz = a.z * b.z;
    return xx + yy + zz;
}
// Dot product of two 4-component vectors: the per-component products are
// summed in x, y, z, w order.
__device__
float getDotProduct(float4 a, float4 b)
{
    const float xx = a.x * b.x;
    const float yy = a.y * b.y;
    const float zz = a.z * b.z;
    const float ww = a.w * b.w;
    return xx + yy + zz + ww;
}
// Returns v scaled to unit length.
//
// The squared length is taken from getDotProduct(v, v). If it is exactly
// zero (a zero vector, or components whose squares underflow to zero) the
// zero vector is returned, because 1.0f / sqrtf(0.0f) is +inf and scaling by
// it would produce inf/NaN components. All other inputs, including NaN, go
// through the unguarded path unchanged.
__device__ float3 getNormalizedVec(const float3 v)
{
    const float lenSq = getDotProduct(v, v);
    if (lenSq == 0.0f)
    {
        // No direction to preserve; avoid the division by zero.
        return make_float3(0.0f, 0.0f, 0.0f);
    }
    const float invLen = 1.0f / sqrtf(lenSq);
    return make_float3(v.x * invLen, v.y * invLen, v.z * invLen);
}
// Returns v scaled to unit length, treating all four components (x, y, z, w)
// as part of the vector.
//
// The squared length is taken from getDotProduct(v, v). If it is exactly
// zero (a zero vector, or components whose squares underflow to zero) the
// zero vector is returned, because 1.0f / sqrtf(0.0f) is +inf and scaling by
// it would produce inf/NaN components. All other inputs, including NaN, go
// through the unguarded path unchanged.
__device__ float4 getNormalizedVec(const float4 v)
{
    const float lenSq = getDotProduct(v, v);
    if (lenSq == 0.0f)
    {
        // No direction to preserve; avoid the division by zero.
        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }
    const float invLen = 1.0f / sqrtf(lenSq);
    return make_float4(v.x * invLen, v.y * invLen, v.z * invLen, v.w * invLen);
}
// Dot product of the x, y, z parts of two float4 vectors.
//
// Copies of a and b are made with w forced to 0.f and handed to the
// 4-component getDotProduct, so the incoming w values never reach the result.
__device__
float dot3F4(float4 a, float4 b)
{
    const float4 lhs = make_float4(a.x, a.y, a.z, 0.f);
    const float4 rhs = make_float4(b.x, b.y, b.z, 0.f);
    const float result = getDotProduct(lhs, rhs);
    return result;
}
__device__
float getLength(float3 a)
{
return sqrtf(getDotProduct(a, a))