参照CUDA Pro Tip: Increase Performance with Vectorized Memory Access
我自己也做了测试(测试数据较多,此处从略,结论与原文一致)。
这篇文章主要讲的是 vectorized load-向量化加载(我认为和CPU上的SIMD是一个原理,一次加载多个数据),可以通过CUDA提供的内建变量,如int2, int4, float2等实现向量加载。
未优化前代码如下
// Baseline scalar copy: one int per grid-stride iteration.
// The grid-stride loop makes the kernel correct for any grid/block
// configuration, including grids smaller than N.
__global__ void device_copy_scalar_kernel(int* d_in, int* d_out, int N) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = gridDim.x * blockDim.x;  // total threads in the grid
  for (int i = tid; i < N; i += stride) {
    d_out[i] = d_in[i];
  }
}
// Launches the scalar copy kernel with 128-thread blocks; the grid is
// sized by ceil-div over N and capped at MAX_BLOCKS (the kernel's
// grid-stride loop covers any elements beyond the cap).
void device_copy_scalar(int* d_in, int* d_out, int N)
{
  const int blockSize = 128;
  const int gridSize = min((N + blockSize - 1) / blockSize, MAX_BLOCKS);
  device_copy_scalar_kernel<<<gridSize, blockSize>>>(d_in, d_out, N);
}
优化后的代码版本1
// Vectorized copy: each iteration moves 8 bytes via a single int2
// load/store (one LDG.64/STG.64 instead of two 32-bit transactions).
// Precondition: d_in and d_out must be 8-byte aligned — true for any
// pointer returned by cudaMalloc.
__global__ void device_copy_vector2_kernel(int* d_in, int* d_out, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;  // total threads in the grid

  // Bulk phase: copy N/2 int2 elements with a grid-stride loop.
  for (int i = idx; i < N / 2; i += stride) {
    reinterpret_cast<int2*>(d_out)[i] = reinterpret_cast<int2*>(d_in)[i];
  }

  // Tail phase: copy the leftover int when N is odd.
  // BUG FIX: the original incremented i by blockIdx.x*blockDim.x+threadIdx.x,
  // which is 0 for thread 0 of block 0 (infinite loop) and a wrong,
  // thread-dependent step for everyone else. A grid-stride loop must
  // advance by the total thread count.
  for (int i = idx + N / 2 * 2; i < N; i += stride) {
    d_out[i] = d_in[i];
  }
}
// Launches the int2 copy kernel. The grid is sized for N/2 vector
// elements (capped at MAX_BLOCKS); the kernel's tail loop handles the
// odd leftover element, so no extra block is needed for it.
// BUG FIX: `threads` and `blocks` were used without declarations, and
// the parameter was named `n` while the body referenced an undeclared
// `N` — the original did not compile.
void device_copy_vector2(int* d_in, int* d_out, int N) {
  int threads = 128;
  int blocks = min((N / 2 + threads - 1) / threads, MAX_BLOCKS);
  device_copy_vector2_kernel<<<blocks, threads>>>(d_in, d_out, N);
}
优化后的代码版本2
// Vectorized copy: each iteration moves 16 bytes via a single int4
// load/store (one LDG.128/STG.128 instead of four 32-bit transactions).
// Precondition: d_in and d_out must be 16-byte aligned — true for any
// pointer returned by cudaMalloc.
__global__ void device_copy_vector4_kernel(int* d_in, int* d_out, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;  // total threads in the grid

  // Bulk phase: copy N/4 int4 elements with a grid-stride loop.
  for (int i = idx; i < N / 4; i += stride) {
    reinterpret_cast<int4*>(d_out)[i] = reinterpret_cast<int4*>(d_in)[i];
  }

  // Tail phase: copy the up-to-3 leftover ints when N % 4 != 0.
  // BUG FIX: the original incremented i by blockIdx.x*blockDim.x+threadIdx.x,
  // which is 0 for thread 0 of block 0 (infinite loop) and a wrong,
  // thread-dependent step for everyone else. A grid-stride loop must
  // advance by the total thread count.
  for (int i = idx + N / 4 * 4; i < N; i += stride) {
    d_out[i] = d_in[i];
  }
}
// Launches the int4 copy kernel with 128-thread blocks. The grid is
// sized for N/4 vector elements and capped at MAX_BLOCKS; the kernel's
// tail loop covers the up-to-3 remaining ints.
void device_copy_vector4(int* d_in, int* d_out, int N) {
  const int blockSize = 128;
  const int vecCount = N / 4;  // number of int4 elements to move
  const int gridSize = min((vecCount + blockSize - 1) / blockSize, MAX_BLOCKS);
  device_copy_vector4_kernel<<<gridSize, blockSize>>>(d_in, d_out, N);
}
可以通过cuobjdump来查看sass代码,得知其不同版本加载使用指令的变化。