我将M2090的3种方法的性能与CUDA 5.0进行了比较 .
[173.179 us] cublas实现如问题所示
[733.734 us]来自@talonmies的 thrust::reduce_by_key 的纯粹推力实施
[1.508 ms]纯推力实现 thrust::inclusive_scan_by_key
可以看出,
cublas在这种情况下表现最好;
thrust::reduce_by_key & thrust::inclusive_scan_by_key 启动多个内核,这会导致额外的开销;
与 thrust::reduce_by_key 相比,
thrust::inclusive_scan_by_key 向DRAM写入更多数据,这可能是内核时间更长的原因之一;
cublas和推力方法的主要性能差异是矩阵列求和 . 推力较慢可能是因为 thrust::reduce_by_key 旨在减少具有变体长度的段,但 cublas_gemv() 只能应用于固定长度的段(行/列) .
当矩阵A足够大以忽略内核启动开销时,cublas approach仍然表现最佳 . 矩阵 A (20,000 x 2,000) 的分析结果如下所示 .
将第一个 for_each 操作与@talonmies指示的 cublasSgemv 调用融合可能会进一步提高性能,但我认为应该使用手写的内核而不是 thrust::reduce_by_key .
3种方法的代码如下所示 .
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/transform.h>
// Element-wise exponential functor: returns exp(x).
// Usable from both host and device code (e.g. inside thrust transform iterators).
// NOTE: the original text had its template arguments stripped by HTML extraction;
// restored here as unary_function<double, double>.
struct Exp: public thrust::unary_function<double, double>
{
__host__ __device__ double operator()(double x) const
{
return exp(x);
}
};
// Element-wise reciprocal functor: returns 1.0 / x.
// Usable from both host and device code.
// NOTE: template arguments were stripped by HTML extraction; restored as
// unary_function<double, double>. Caller must guarantee x != 0.
struct Inv: public thrust::unary_function<double, double>
{
__host__ __device__ double operator()(double x) const
{
return (double) 1.0 / x;
}
};
// Multiply-by-constant functor: returns x * C for a constant C fixed at
// construction. Usable from both host and device code.
// NOTE: the template parameter list was stripped by HTML extraction; restored.
template<typename T>
struct MulC: public thrust::unary_function<T, T>
{
T C;
__host__ __device__ MulC(T c) :
C(c)
{
}
__host__ __device__ T operator()(T x) const
{
return x * C;
}
};
// Maps a linear element index of a column-major matrix to its column index:
// col = i / C, where C is the number of rows per column. Used as the "key"
// generator for thrust's *_by_key algorithms so that each column forms one segment.
// NOTE: the template parameter list was stripped by HTML extraction; restored.
template<typename T>
struct line2col: public thrust::unary_function<T, T>
{
T C;
__host__ __device__ line2col(T C) :
C(C)
{
}
__host__ __device__ T operator()(T i) const
{
return i / C;
}
};
// Benchmarks three ways of normalizing each column of an m x n matrix by the
// sum of exp() of its entries (softmax-style column normalization):
//   1) cuBLAS: exp via thrust::transform, column sums via cublasDgemv with a
//      ones vector, then scale columns via cublasDdgmm.
//   2) Pure thrust with reduce_by_key (one segment per column).
//   3) Pure thrust with inclusive_scan_by_key + a gather of the last prefix
//      value of each column.
// NOTE: all template arguments (device_vector<double>, line2col<int>, etc.)
// were stripped by HTML extraction and are restored here.
int main()
{
cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
cublasHandle_t hd;
curandGenerator_t rng;
cublasCreate(&hd);
curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);
const size_t m = 2000, n = 200;
const double c1 = 1.0;
const double c0 = 0.0;
thrust::device_vector<double> A(m * n);   // input matrix, column-major
thrust::device_vector<double> B(m * n);   // exp(A), then normalized in place
thrust::device_vector<double> C(m * n);   // output of the thrust variants
thrust::device_vector<double> sum1(1 * n);  // per-column sums (cuBLAS path)
thrust::device_vector<double> sum2(1 * n);  // per-column sums (thrust paths)
thrust::device_vector<double> one(m * n, 1.0);  // ones vector for the gemv sum
double* pA = thrust::raw_pointer_cast(&A[0]);
double* pB = thrust::raw_pointer_cast(&B[0]);
double* pSum1 = thrust::raw_pointer_cast(&sum1[0]);
double* pSum2 = thrust::raw_pointer_cast(&sum2[0]);
double* pOne = thrust::raw_pointer_cast(&one[0]);
curandGenerateUniformDouble(rng, pA, A.size());
const int count = 2;
// --- Method 1: cuBLAS ---
for (int i = 0; i < count; i++)
{
thrust::transform(A.begin(), A.end(), B.begin(), Exp());
// Column sums: sum1 = B^T * ones (gemv with op = transpose).
cublasDgemv(hd, CUBLAS_OP_T, m, n, &c1, pB, m, pOne, 1, &c0, pSum1, 1);
thrust::transform(sum1.begin(), sum1.end(), sum1.begin(), Inv());
// Scale each column j of B by sum1[j] (now holding 1/sum).
// BUG FIX: the original passed pSum2 here, but sum2 is never written in
// this loop — the inverted column sums live in sum1.
cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pB, m, pSum1, 1, pB, m);
}
// --- Method 2: thrust::reduce_by_key ---
for (int i = 0; i < count; i++)
{
// Keys map linear index -> column index, so each column is one segment.
thrust::reduce_by_key(
thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)),
thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(),
thrust::make_transform_iterator(A.begin(), Exp()),
thrust::make_discard_iterator(),
sum2.begin());
thrust::transform(
A.begin(), A.end(),
thrust::make_permutation_iterator(
sum2.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))),
C.begin(),
thrust::divides<double>());
}
// --- Method 3: thrust::inclusive_scan_by_key ---
for (int i = 0; i < count; i++)
{
thrust::inclusive_scan_by_key(
thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)),
thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(),
thrust::make_transform_iterator(A.begin(), Exp()),
C.begin());
// Gather the last prefix value of each column (index j*m + m - 1): that is
// the full column sum.
thrust::copy(
thrust::make_permutation_iterator(
C.begin() + m - 1,
thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))),
thrust::make_permutation_iterator(
C.begin() + m - 1,
thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))) + n,
sum2.begin()),
thrust::transform(
A.begin(), A.end(),
thrust::make_permutation_iterator(
sum2.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))),
C.begin(),
thrust::divides<double>());
}
curandDestroyGenerator(rng);
cublasDestroy(hd);
return 0;
}