size_t fea_pitch;
texture<unsigned char, 2> features2D;
cudaMallocPitch((void**)(&dev_features), &fea_pitch, sizeof(unsigned char) * sfeaturesw, sfeaturesh);
cudaChannelFormatDesc feaDesc = cudaCreateChannelDesc<unsigned char>();
cudaMemcpy2D(dev_features, fea_pitch, sfeatures, sizeof(unsigned char) * sfeaturesw, sizeof(unsigned char) * sfeaturesw, sfeaturesh, cudaMemcpyHostToDevice);
cudaBindTexture2D(NULL, features2D, dev_features, feaDesc, sfeaturesw, sfeaturesh, fea_pitch);//绑定
纹理拾取(读取)的步骤
point1=tex2D(imageData2D,box_x+x1,box_y+y1);//第y1行,第x1列,cpu版即这个意思,此处尤须注意,和一般数据结构不同
cudaMallocPitch((void**)(&dev_features), &fea_pitch, sizeof(unsigned char) * sfeaturesw, sfeaturesh);
cudaChannelFormatDesc feaDesc = cudaCreateChannelDesc<unsigned char>();
cudaMemcpy2D(dev_features, fea_pitch, sfeatures, sizeof(unsigned char) * sfeaturesw, sizeof(unsigned char) * sfeaturesw, sfeaturesh, cudaMemcpyHostToDevice);
cudaBindTexture2D(NULL, features2D, dev_features, feaDesc, sfeaturesw, sfeaturesh, fea_pitch);
--------------------------------------------------------------------------------
int sfeatures_size = sizeof(unsigned char) * sfeaturesw * sfeaturesh;
cudaChannelFormatDesc chDesc2 = cudaCreateChannelDesc<unsigned char>();
cudaMallocArray(&featuresArray, &chDesc2, sfeaturesw, sfeaturesh);
cudaMemcpyToArray( featuresArray, 0, 0, sfeatures, sfeatures_size, cudaMemcpyHostToDevice );
cudaBindTextureToArray( features2D, featuresArray);
-------------------------------------------------------------------------------------
int grid_data_size = sizeof(float) * gridl;
cudaMalloc((void**)&dev_grid,grid_data_size);
cudaMemcpy(dev_grid,sgrid,grid_data_size,cudaMemcpyHostToDevice);
cudaBindTexture(0,gridData1D,dev_grid);
对于一维纹理,如果绑定的是线性内存(Linear Memory,即通过cudaMalloc分配并用cudaBindTexture绑定的内存),只能使用tex1Dfetch按整数下标读取;tex1D只适用于绑定到一维cudaArray的纹理
而对于二维纹理,不管是cudaArray还是cudaMallocPitch都是使用tex2D
下面是关于cudaMemcpy2D和cudaMallocPitch两个函数的参数和用法
最近学习了CUDA矩阵内存对齐分配的方法,主要是cudaMemcpy2D和cudaMallocPitch两个函数的用法,先看看CUDA库(CUDA Runtime API)中是如何定义这两个函数的:
cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height)
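Allocates at least widthInBytes * height bytes of linear memory on the device and returns in *devPtr a pointer to the allocated memory. The function may pad the allocation to ensure that corresponding pointers in any given row will continue to meet the alignment requirements for coalescing as the address is updated from row to row. The pitch returned in *pitch by cudaMallocPitch() is the width in bytes of the allocation. The intended usage of pitch is as a separate parameter of the allocation, used to compute addresses within the 2D array. Given the row and column of an array element of type T, the address is computed as: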
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
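For allocations of 2D arrays, it is recommended that programmers consider performing pitch allocations using cudaMallocPitch(). Due to pitch alignment restrictions in the hardware, this is especially true if the application will be performing 2D memory copies between different regions of device memory (whether linear memory or CUDA arrays).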
Parameters:
- devPtr - Pointer to allocated pitched device memory
- pitch - Pitch for allocation
- width - Requested pitched allocation width (in bytes)
- height - Requested pitched allocation height
cudaError_t cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind)
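Copies a matrix (height rows of width bytes each) from the memory area pointed to by src to the memory area pointed to by dst, where kind is one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, or cudaMemcpyDeviceToDevice, and specifies the direction of the copy. dpitch and spitch are the widths in memory in bytes of the 2D arrays pointed to by dst and src, including any padding added to the end of each row. The memory areas may not overlap. Calling cudaMemcpy2D() with dst and src pointers that do not match the direction of the copy results in an undefined behavior. cudaMemcpy2D() returns an error if dpitch or spitch exceeds the maximum allowed.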
Parameters:
- dst - Destination memory address
- dpitch - Pitch of destination memory
- src - Source memory address
- spitch - Pitch of source memory
- width - Width of matrix transfer (columns in bytes)
- height - Height of matrix transfer (rows)
- kind - Type of transfer
由此,可以对这两个函数有个充分的认识。此外,cudaMallocPitch和cudaMemcpy2D,一般用于二维数组每行的字节数不满足对齐要求(例如各维度size不是2的幂次方)的情况。使用cudaMallocPitch()时,该数组的对齐、大小、起始地址等都会自动处理好,其返回的pitch就是真正分配给数组每一行的字节数(往往大于实际申请的行宽),整个数组占用的空间为pitch*height字节。
PS:
pitch的理解:
C语言申请2维内存时,一般是连续存放的。a[y][x]存放在第y*widthofx*sizeof(元素)+x*sizeof(元素)个字节。但在cuda的global memory访问中,从256字节对齐的地址(addr=0, 256, 512, ...)开始的连续访问是最有效率的。 这样,为了提高内存访问的效率,有了cudaMallocPitch函数。 cudaMallocPitch函数分配的内存中,数组的每一行的第一个元素的开始地址都保证是对齐的。因为每行有多少个数据是不确定的widthofx*sizeof(元素)不一定是256的倍数。故此,为保证数组的每一行的第一个元素的开始地址对齐,cudaMallocPitch在分配内存时,每行会多分配一些字节,以保证widthofx*sizeof(元素)+多分配的字节是256的倍数(对齐)。这样,y*widthofx*sizeof(元素)+x*sizeof(元素)来计算a[y][x]的地址就不正确了。而应该是y*[widthofx*sizeof(元素)+多分配的字节]+x*sizeof(元素)。而函数中返回的pitch的值就是widthofx*sizeof(元素)+多分配的字节。
一、内存对齐的原因
大部分的参考资料都是如是说的:
1、平台原因(移植原因):不是所有的硬件平台都能访问任意地址上的任意数据的;某些硬件平台只能在某些地址处取某些特定类型的数据,否则抛出硬件异常。
2、性能原因:数据结构(尤其是栈)应该尽可能地在自然边界上对齐。 原因在于,为了访问未对齐的内存,处理器需要作两次内存访问;而对齐的内存访问仅需要一次访问。