如何对CUDA中的矩阵按列标准化并获得最大性能?

我将M2090的3种方法的性能与CUDA 5.0进行了比较 .

[173.179 us] cublas实现如问题所示

[733.734 us]来自@talonmies的 thrust::reduce_by_key 的纯粹推力实施

[1.508 ms]纯推力实现 thrust::inclusive_scan_by_key

HSePu.png

可以看出,

cublas在这种情况下表现最好;

thrust::reduce_by_key & thrust::inclusive_scan_by_key 启动多个内核,这会导致额外的开销;

与 thrust::reduce_by_key 相比,

thrust::inclusive_scan_by_key 向DRAM写入更多数据,这可能是内核时间更长的原因之一;

cublas和推力方法的主要性能差异是矩阵列求和 . 推力较慢可能是因为 thrust::reduce_by_key 旨在减少具有变体长度的段,但 cublas_gemv() 只能应用于固定长度的段(行/列) .

当矩阵A足够大以忽略内核启动开销时,cublas方法仍然表现最佳 . A_{20,000 x 2,000}的分析结果如下所示 .

o3VoU.png

将第一个 for_each 操作与@talonmies指示的 cublasSgemv 调用融合可能会进一步提高性能,但我认为应该使用手写的内核而不是 thrust::reduce_by_key .

3种方法的代码如下所示 .

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/discard_iterator.h>

struct Exp: public thrust::unary_function

{

__host__ __device__ double operator()(double x)

{

return exp(x);

}

};

struct Inv: public thrust::unary_function

{

__host__ __device__ double operator()(double x)

{

return (double) 1.0 / x;

}

};

// Unary device functor: multiply by a fixed constant C.
// Used with a counting iterator to generate the strided indices i*C
// (here: the flat offset of the last element of each column).
// NOTE: the scrape dropped the template header and base-class arguments; restored here.
template<typename T>
struct MulC: public thrust::unary_function<T, T>
{
    T C;  // the fixed multiplier (the column length m in main())

    __host__ __device__ MulC(T c) :
        C(c)
    {
    }

    __host__ __device__ T operator()(T x)
    {
        return x * C;
    }
};

// Unary device functor: map a flat (column-major) element index to its column index,
// i.e. i / C where C is the number of rows. Used as the key generator for the
// reduce_by_key / inclusive_scan_by_key segmented reductions.
// NOTE: the scrape dropped the template header and base-class arguments; restored here.
template<typename T>
struct line2col: public thrust::unary_function<T, T>
{
    T C;  // number of rows per column (segment length)

    __host__ __device__ line2col(T C) :
        C(C)
    {
    }

    __host__ __device__ T operator()(T i)
    {
        return i / C;
    }
};

int main()

{

cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);

cublasHandle_t hd;

curandGenerator_t rng;

cublasCreate(&hd);

curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);

const size_t m = 2000, n = 200;

const double c1 = 1.0;

const double c0 = 0.0;

thrust::device_vector A(m * n);

thrust::device_vector B(m * n);

thrust::device_vector C(m * n);

thrust::device_vector sum1(1 * n);

thrust::device_vector sum2(1 * n);

thrust::device_vector one(m * n, 1);

double* pA = thrust::raw_pointer_cast(&A[0]);

double* pB = thrust::raw_pointer_cast(&B[0]);

double* pSum1 = thrust::raw_pointer_cast(&sum1[0]);

double* pSum2 = thrust::raw_pointer_cast(&sum2[0]);

double* pOne = thrust::raw_pointer_cast(&one[0]);

curandGenerateUniformDouble(rng, pA, A.size());

const int count = 2;

for (int i = 0; i < count; i++)

{

thrust::transform(A.begin(), A.end(), B.begin(), Exp());

cublasDgemv(hd, CUBLAS_OP_T, m, n, &c1, pB, m, pOne, 1, &c0, pSum1, 1);

thrust::transform(sum1.begin(), sum1.end(), sum1.begin(), Inv());

cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pB, m, pSum2, 1, pB, m);

}

for (int i = 0; i < count; i++)

{

thrust::reduce_by_key(

thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col(m)),

thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col(m)) + A.size(),

thrust::make_transform_iterator(A.begin(), Exp()),

thrust::make_discard_iterator(),

sum2.begin());

thrust::transform(

A.begin(), A.end(),

thrust::make_permutation_iterator(

sum2.begin(),

thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col(m))),

C.begin(),

thrust::divides());

}

for (int i = 0; i < count; i++)

{

thrust::inclusive_scan_by_key(

thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col(m)),

thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col(m)) + A.size(),

thrust::make_transform_iterator(A.begin(), Exp()),

C.begin());

thrust::copy(

thrust::make_permutation_iterator(

C.begin() + m - 1,

thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC(m))),

thrust::make_permutation_iterator(

C.begin() + m - 1,

thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC(m))) + n,

sum2.begin());

thrust::transform(

A.begin(), A.end(),

thrust::make_permutation_iterator(

sum2.begin(),

thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col(m))),

C.begin(),

thrust::divides());

}

curandDestroyGenerator(rng);

cublasDestroy(hd);

return 0;

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值