CUDA-纹理内存-3d

纹理内存和表面内存(surface memory)实质上是全局内存的一个特殊形态,全局内存被绑定为纹理内存(表面内存),对其的读(写)操作将通过专门的texture cache(纹理缓存)进行,其实称为纹理缓存更加贴切。

纹理缓存的优势:纹理缓存具备硬件插值特性,可以实现最近邻插值和线性插值。纹理缓存针对二维空间的局部性访问进行了优化,所以通过纹理缓存访问二维矩阵的邻域会获得加速。纹理缓存不需要满足全局内存的合并访问条件。

纹理的创建通过texture object或者texture reference,注意只有计算能力3.0以上的设备支持texture object。
texture object在运行时创建,texture reference在编译期创建,运行时绑定到实际的纹理。

texture reference必须定义为全局变量,不能作为函数的参数。

纹理可以是一段连续的设备内存,也可以是一个CUDA数组。但是CUDA数组对局部寻址有优化,称为“块线性”,原理是将邻域元素缓存在同一条cache线上,这将加快邻域内的寻址,但是对于设备内存,并没有“块线性”。所以,选择采用CUDA数组,还是设备内存,需要根据实际情况决定,将数据copy至CUDA数组是很耗时的。

纹理的一个元素称为texels。

纹理最多支持三维,width,x方向的维数;height,y方向的维数;depth,z方向的维数

read mode:(1)归一化浮点模式,fetch将返回【0.0,1.0】的归一化之后的数值,输入数据类型为8位、16位整型或浮点数;(2)元素类型模式,fetch返回原始数值

纹理支持归一化的浮点坐标,计算方式如下:[0.0,1-1/N],N为当前维度的维数。归一化坐标在某些情况下非常方便,如放大缩小操作.

addressing mode:寻找模式,定义超出坐标范围的取值(越界情况)。一共四种模式:cudaAddressModeBorder, cudaAddressModeClamp(默认模式), cudaAddressModeWrap, and cudaAddressModeMirror,后两个只能在归一化坐标时使用。addressing mode定义为一个三维向量,分别代表纹理各个方向上的寻址模式

cudaAddressModeClamp:超出范围就用边界值代替,示意: AA | ABCDE | EE
cudaAddressModeBorder:超出范围就用零代替,示意: 00 | ABCDE | 00
cudaAddressModeWrap:重叠模式(循环),示意: DE | ABCDE || AB
cudaAddressModeMirror:镜像模式,示意: BA | ABCDE | ED

这里主要说的是filtering mode:滤波模式,定义了fetch返回结果的计算方式。有两种模式:cudaFilterModePoint or cudaFilterModeLinear。

cudaFilterModePoint:点模式,返回取floor之后的点。
cudaFilterModeLinear:线性模式,即线性插值;对于一维纹理,两点插值;对于二维纹理,四点插值;对于三维纹理,八点插值。线性模式只有在fetch返回浮点类型数据(注意并非指read mode的归一化浮点模式)下才有效。值得注意的是,线性模式时,插值用的点实际坐标在输入tex3D/tex2D的坐标基础上减去0.5

在以下代码上验证:

//main.cpp
#include <iostream>
#include <fstream>
#include <string>
#include <io.h>
#include <direct.h>
#include <sstream>
#include <windows.h>

//#include "testTexture.cu"

#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\cuda_runtime.h"
#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\device_launch_parameters.h"

using namespace std;

extern "C" int runCopyKernel(float* pDevImg, float* pData, int voxelX, int voxelY, int voxelZ);

int main(int argc, char* argv[])
{
    int voxelX = 100;  // width
    int voxelY = 120;  // height
    int voxelZ = 140;  // depth
    float* pData = new float[voxelX * voxelY * voxelZ];
    for(int iNum = 0; iNum < voxelX * voxelY * voxelZ; iNum++)
    {
        pData[iNum] = (float)iNum;
    }
    float *pDevImg;
    cudaMalloc((void**)&pDevImg, voxelX * voxelY * voxelZ * sizeof(float));

    runCopyKernel(pDevImg, pData, voxelX, voxelY, voxelZ);

    // copy divice to host
    float* fHost = new float[voxelX * voxelY * voxelZ];
    cudaMemcpy((void*)fHost, (void*)pDevImg, voxelX * voxelY * voxelZ * sizeof(float), cudaMemcpyDeviceToHost);

    std::ofstream fout("E:/test/textureTest.raw",std::ios::out|std::ios::binary|std::ios::ate);
    fout.write((char *)fHost, voxelX * voxelY * voxelZ * sizeof(float));
    fout.close();

    cudaFree(pDevImg);

    return 0;
}

//testTexture.cu
#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\cuda_texture_types.h"
#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\cuda_runtime.h"
#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\texture_types.h"
#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\device_launch_parameters.h"
#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\cuda.h"


texture<float, cudaTextureType3D, cudaReadModeElementType> texDensity;

__global__ void  copyTexToDevice(float* pDevImg, int width, int height, int depth)
{
    int iVoxel = blockIdx.x * blockDim.x + threadIdx.x;
    int jVoxel = blockIdx.y * blockDim.y + threadIdx.y;
    int kVoxel = blockIdx.z * blockDim.z + threadIdx.z;

    if (jVoxel >= height  || iVoxel >= width || kVoxel >= depth)
    {
        return;
    }

    float fPosX = (float)iVoxel + 0.50f;
    float fPosY = (float)jVoxel + 0.50f;
    float fPosZ = (float)kVoxel + 0.50f;
	
    int indexO = kVoxel * width * height + jVoxel * width + iVoxel;
    pDevImg[indexO] = tex3D(texDensity, fPosX, fPosY, fPosZ);
}

extern "C" int runCopyKernel(float* pDevImg, float* pData, int width, int height, int depth)
{
	// 初始化
    cudaExtent volumeSize = make_cudaExtent(width, height, depth);
    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc<float>();
    cudaArray *d_volumeArray = 0;
    cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize);

    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr = make_cudaPitchedPtr((void*)pData, volumeSize.width * sizeof(float), volumeSize.width,volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent = volumeSize;
    copyParams.kind = cudaMemcpyHostToDevice;
    cudaMemcpy3D(&copyParams);

    // set texture parameters
    texDensity.normalized = false;
    texDensity.filterMode = cudaFilterModeLinear;  //cudaFilterModeLinear // cudaFilterModePoint
    texDensity.addressMode[0] = cudaAddressModeBorder;
    texDensity.addressMode[1] = cudaAddressModeBorder;

    cudaBindTextureToArray(texDensity, d_volumeArray, channelDesc);

	int blockSizeX = 4;
    int blockSizeY = 1;
    int blockSizeZ = 4;
    int gridSizeX = (width + blockSizeX - 1) / blockSizeX;
    int gridSizeY = (height + blockSizeY - 1) / blockSizeY;
    int gridSizeZ = (depth + blockSizeZ - 1) / blockSizeZ;
    dim3 threadsPerBlock(blockSizeX, blockSizeY, blockSizeZ);
    dim3 blocksPerGrid(gridSizeX, gridSizeY, gridSizeZ);

    copyTexToDevice<<<blocksPerGrid, threadsPerBlock>>>(pDevImg, width, height, depth);

    cudaUnbindTexture(texDensity);

	return 0;
}

在以上代码中,make_cudaPitchedPtr将新建cudaPitchedPtr并将pData上的内容保存在cudaPitchedPtr上,保存的顺序为width, height, depth, 。

// main.cpp
#include <string>
#include <io.h>
#include <direct.h>
#include <sstream>
#include <windows.h>

#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\cuda_runtime.h"
#include "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\device_launch_parameters.h"

using namespace std;

//extern "C" int runCopyKernel(float* pDevImg, float* pData, int voxelX, int voxelY, int voxelZ);

int main(int argc, char* argv[])
{
    int voxelX = 100;  // width
    int voxelY = 100;  // depth
    int voxelZ = 100;  // height
    float* pData = new float[voxelX * voxelY * voxelZ];
    for(int iNum = 0; iNum < voxelX * voxelY * voxelZ; iNum++)
    {
        pData[iNum] = (float)iNum;
    }
    float* fHost = new float[voxelX * voxelY * voxelZ];

    cudaPitchedPtr devPitchPtr;
    cudaExtent extentObj = make_cudaExtent(voxelX * sizeof(float), voxelZ, voxelY);
    cudaMalloc3D(&devPitchPtr, extentObj);
    cudaMemset3D(devPitchPtr, 0, extentObj);

    // 初始化
    cudaExtent volumeSize = make_cudaExtent(voxelX , voxelZ, voxelY);

    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr = make_cudaPitchedPtr((void*)pData, volumeSize.width * sizeof(float), volumeSize.width,volumeSize.height);
    copyParams.dstPtr = devPitchPtr;
    copyParams.extent = extentObj;
    copyParams.kind = cudaMemcpyHostToDevice;
    cudaMemcpy3D(&copyParams);

    // copy divice to host
    cudaMemcpy3DParms DevToHost = { 0 };
    DevToHost.srcPtr = devPitchPtr;
    DevToHost.dstPtr = make_cudaPitchedPtr((void*)fHost, volumeSize.width * sizeof(float), volumeSize.width, volumeSize.height);
    DevToHost.extent = extentObj;
    DevToHost.kind = cudaMemcpyDeviceToHost;
    cudaMemcpy3D(&DevToHost);


    std::ofstream fout("E:/test/textureTest.raw",std::ios::out|std::ios::binary|std::ios::ate);
    fout.write((char *)fHost, voxelX * voxelY * voxelZ * sizeof(float));
    fout.close();

    cudaFree(&devPitchPtr);

    return 0;
}
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值