Intel MediaSDK sample_decode 官方GPU解码流程学习 - DirectX11 D3D11和OpenCL共享资源

最近一直在研究GPU全栈处理的方法。比如OpenVINO做GPU推理的时候,如果按照传统的思路MediaSDK/FFMPEG GPU解码 - CPU读出D3D11 buffer里的解码数据再传给OpenCL GPU buffer - OpenVINO做GPU推理,

这种由CPU做VRAM - 系统内存 - VRAM的拷贝方式势必会增加CPU的开销,而且也没必要,所以我一直想找一种方法能够把DirectX11的NV12数据直接在显存里共享给OpenCL框架访问。

首先研究一下简单的OpenCL和D3D11之间的资源共享

在Intel官网有这么一篇文章可以用来参考

Sharing Surfaces between OpenCL™ and DirectX* 11 on Intel® Processor...

大概流程是这样的:

初始化

1. OpenCL 初始化

  • 首先是查询显卡的OpenCL扩展属性支持,里面是否包含"cl_khr_dx11_sharing"属性
char extension_string[1024];
memset(extension_string, '', 1024);
status = clGetPlatformInfo( platforms[i], 
CL_PLATFORM_EXTENSIONS,
sizeof(extension_string), 
          extension_string, 
          NULL);
char *extStringStart = NULL;
extStringStart = strstr(extension_string, "cl_khr_dx11_sharing");
if(extStringStart != 0){
        printf("Platform does support cl_khr_dx11_sharing\n");
        …
}
  •  接下来利用cl_khr_dx11_sharing扩展提供的一些函数从创建的d3d11 device的句柄中获取对应的 OpenCL Device ID, 再利用这个CL Device ID创建OpenCL的context和commandQueue。因为这个ocl context是基于d3d11 device句柄创建的,所以他们之间的内存数据(buffer)是可以互相访问的。
// Properties used to create the CL context: bind it to our platform and to the
// existing D3D11 device so buffers can be shared between the two APIs.
// NOTE(review): the (intptr_t) cast on g_pd3dDevice works in practice, but
// (cl_context_properties) would be the conventional cast here — confirm.
cl_context_properties cps[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)g_platformToUse, CL_CONTEXT_D3D11_DEVICE_KHR, (intptr_t)g_pd3dDevice, CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE, 0 };


...


//获取可以和d3d11设备共享的opencl设备数量,我理解对应的是多显卡的时候这个numDevs应该大于1
clGetDeviceIDsFromD3D11KHR_fn ptrToFunction_clGetDeviceIDsFromD3D11KHR = NULL;
ptrToFunction_clGetDeviceIDsFromD3D11KHR = (clGetDeviceIDsFromD3D11KHR_fn) clGetExtensionFunctionAddressForPlatform(g_platformToUse, "clGetDeviceIDsFromD3D11KHR");
	
cl_uint numDevs = 0;
//careful with the g_pd3DDevice
status = ptrToFunction_clGetDeviceIDsFromD3D11KHR(g_platformToUse, CL_D3D11_DEVICE_KHR, (void *)g_pd3dDevice, CL_PREFERRED_DEVICES_FOR_D3D11_KHR, 0, NULL, &numDevs);
testStatus(status, "Failed on clGetDeviceIDsFromD3D11KHR");


...

//创建对应的cl context
cl_device_id *devID = NULL;
g_clDevices = (cl_device_id *)malloc(sizeof(cl_device_id) * numDevs);
ptrToFunction_clGetDeviceIDsFromD3D11KHR(g_platformToUse, CL_D3D11_DEVICE_KHR, (void *)g_pd3dDevice, CL_PREFERRED_DEVICES_FOR_D3D11_KHR, numDevs, g_clDevices, NULL);
testStatus(status, "Failed on clGetDeviceIDsFromD3D11KHR");

//create an OCL context from the device we are using as our DX11 rendering device
g_clContext = clCreateContext(cps, 1, g_clDevices, NULL, NULL, &status);
testStatus(status, "clCreateContext error");

//创建cl commandQueue
//create an openCL commandqueue
g_clCommandQueue = clCreateCommandQueue(g_clContext, devID, 0, &status);
testStatus(status, "clCreateCommandQueue error");

 2. 创建和opencl共享的d3d11 texture2D

这里创建D3d11 texture2D的描述符里要带着D3D11_RESOURCE_MISC_SHARED属性

desc.MiscFlags = D3D11_RESOURCE_MISC_SHARED;


void CreateTextureDX11()

{

       unsigned char *texture = NULL;

       texture = (unsigned char *)malloc(sizeof(unsigned char) * NUM_IMAGE_CHANNELS * SHARED_IMAGE_HEIGHT * SHARED_IMAGE_WIDTH);

       if(texture == nullptr)

       {

              printf("error creating texture\n");

       }

 

       for(unsigned int i=0;i<NUM_IMAGE_CHANNELS * SHARED_IMAGE_HEIGHT * SHARED_IMAGE_WIDTH;)

       {

              texture[i++] = 255;

              texture[i++] = 0;

              texture[i++] = 0;

              texture[i++] = 255;

       }

 

       D3D11_TEXTURE2D_DESC desc;

       ZeroMemory(&desc, sizeof(D3D11_TEXTURE2D_DESC));

       desc.Width = SHARED_IMAGE_WIDTH;

       desc.Height = SHARED_IMAGE_HEIGHT;

       desc.MipLevels = 1;

       desc.ArraySize = 1;

       desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; desc.SampleDesc.Count = 1;

       desc.SampleDesc.Quality = 0;

       desc.Usage = D3D11_USAGE_DEFAULT;

       desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;

       desc.CPUAccessFlags = 0;

       if(g_UseD3D11_RESOURCE_MISC_SHAREDflag == true)

       {

              printf("Using the D3D11_RESOURCE_MISC_SHARED flag\n");

              desc.MiscFlags = D3D11_RESOURCE_MISC_SHARED;

       }

       else

       {

              desc.MiscFlags = 0;

       }

       D3D11_SUBRESOURCE_DATA tbsd;

       ZeroMemory(&tbsd, sizeof(D3D11_SUBRESOURCE_DATA));

       tbsd.pSysMem = (void *)texture;

       tbsd.SysMemPitch = SHARED_IMAGE_WIDTH * NUM_IMAGE_CHANNELS;

       tbsd.SysMemSlicePitch = SHARED_IMAGE_WIDTH * SHARED_IMAGE_HEIGHT * NUM_IMAGE_CHANNELS;

       g_pd3dDevice->CreateTexture2D(&desc, &tbsd, &g_pSharedDX11Texture2D);

       //still need to bind

       free(texture);      

}

3. 创建和texture 2D共享的openCL buffer对象

这里利用上一步创建出的g_pSharedDX11Texture2D指针来创建OpenCL buffer, 这样创建出的ocl buffer和d3d11 texture2D的像素数据会共享同一段内存空间

int ShareDX11BufferWithCL()
{
	int status = 0;

	g_SharedRGBAimageCLMemObject = ptrToFunction_clCreateFromD3D11Texture2DKHR(g_clContext, CL_MEM_WRITE_ONLY, g_pSharedDX11Texture2D, 0, &status);	
	if(status == 0)
	{
		printf("Successfully shared!\n");
		status = SUCCESS;
	}
	else
	{
		printf("Sharing failed\n");
		status = FAIL;
	}
	return status;
}

实际每帧渲染过程中的数据交互流程

4. 锁定ocl buffer, 防止d3d11访问texture2D里数据

status = ptrToFunction_clEnqueueAcquireD3D11ObjectsKHR(g_clCommandQueue, 1, &g_SharedRGBAimageCLMemObject, 0, 0, 0);

5. 用ocl kernel代码操作ocl buffer

Texture2D buffer对于opencl来说是一个opencl image buffer, 也就是说不能用数据指针直接访问buffer的方式来读取,需要用sampler采样器来读写(因为大部分的硬件解码器访问的frame buffer的格式都是从节省内存带宽的角度考虑设计的,像素的排布不是线性排列,对于intel显卡这部分内容,可以参考https://01.org/sites/default/files/documentation/intel-gfx-prm-osrc-skl-vol05-memory_views.pdf,正文第8页,Linear vs Tiled Storage 里面关于tiled surface layout的描述) 所以访问数据的速度应该是和sampler的数量和性能直接相关。

另外这里发现个很微妙的设定, 传进来的ocl buffer的属性,要么是只写__write_only, 要么是只读__read_only, 如果简单地直接改成读写__read_write属性,编译ocl kernel的时候会报错。根据intel的文档看,支持读写属性需要用到OpenCL 2.0+, 同时代码里读写像素要添加barrier之类的同步操作,以免不同的kernel因为执行速度的不同,有些地址的数据会存在先写入后读取,另一些地址的数据存在先读取后写入的不同步错误。

// Paint the pixels inside the box [xMin,xMax] x [yMin,yMax] blue, with the
// blue channel's intensity driven by fDimmerSwitch (0.0 .. 1.0).
kernel void drawBox(__write_only image2d_t output, float fDimmerSwitch)
{
       int x = get_global_id(0);   // one work-item per pixel column
       int y = get_global_id(1);   // one work-item per pixel row

       // Box bounds; work-items outside the box write nothing.
       int xMin = 0, xMax = 1, yMin = 0, yMax = 1;

       if((x >= xMin) && (x <= xMax) && (y >= yMin) && (y <= yMax))
       {      
              write_imagef(output, (int2)(x, y), 
                           (float4)(0.f, 0.f, fDimmerSwitch, 1.f));
       }
} 

6. 解锁ocl buffer, 以便d3d11继续访问texture2D数据

status = ptrToFunction_clEnqueueReleaseD3D11ObjectsKHR(g_clCommandQueue, 1, &g_SharedRGBAimageCLMemObject, 0, NULL, NULL);

接下来再看看OpenCL和D3D11之间的基于NV12格式数据的资源共享

上面这个文章是共享RGBA的D3D11 surface给OpenCL, 但是Intel集显硬件解码器输出的是NV12格式的D3D11 Texture2D. 我需要知道的是应该如何在OpenCL里用read_image/write_image的方法访问NV12数据里的UV数据段。

在网上搜了一圈,这方面的相关内容非常少,但是最后发现踏破铁鞋无觅处,得来全不费工夫 - 答案就藏在随手可得的Intel MediaSDK 的sample_plugin_opencl示例项目里。

1. 分别创建对接Texture2D的Y和UV的2个cl_mem对象

在示例里Texture2D里面的Y和UV数据区,需要创建2个cl_mem对象分别对应。在opencl_filter_dx11.cpp里, CreateSharedSurface()函数的参数nView控制创建的opencl buffer指向texture2D的哪块数据,

nView=0, 创建的cl_mem指向texture2D的Y数据区,

nView=1  则创建的cl_mem指向UV数据区。

// Wrap one plane of a D3D11 NV12 surface in a cl_mem image object.
// nView selects the plane passed as the subresource index: 0 = Y, 1 = UV.
// Returns 0 (no object) on any failure.
cl_mem OpenCLFilterDX11::CreateSharedSurface(mfxMemId mid, int nView, bool bIsReadOnly)
{
    mfxHDLPair mid_pair = { 0 };
    // Resolve the MediaSDK memory id to the underlying D3D11 texture handle.
    mfxStatus sts = m_pAlloc->GetHDL(m_pAlloc->pthis, mid, reinterpret_cast<mfxHDL*>(&mid_pair));
    if (sts) return 0;

    ID3D11Texture2D *surf = (ID3D11Texture2D*)mid_pair.first;

    cl_int error = CL_SUCCESS;
    // Create the CL image aliasing the selected plane of the texture.
    cl_mem mem = clCreateFromD3D11Texture2DKHR(m_clcontext, bIsReadOnly ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE,
                                            surf, nView, &error);
    if (error) {
        log.error() << "clCreateFromD3D11Texture2DKHR failed. Error code: " << error << endl;
        return 0;
    }
    return mem;
}

2. 设置clEnqueueNDRangeKernel()的global work size大小

而根据NV12类似YUV420的特性,UV在垂直和水平方向上的采样率分别是Y的1/2, 所以在clEnqueueNDRangeKernel()传进去的GlobalWorkSize的参数上,UV的global work size要是Y的1/2。这部分的代码在opencl_filter.cpp里

// Excerpt: set the NDRange global work sizes for an NV12 surface.
// NV12 subsamples chroma 2x both horizontally and vertically, so the
// UV grid must be half the Y grid in each dimension.
cl_int OpenCLFilterBase::PrepareSharedSurfaces(int width, int height, mfxMemId surf_in, mfxMemId surf_out)
{
...
        // Work sizes for Y plane
        m_GlobalWorkSizeY[0] = m_currentWidth;
        m_GlobalWorkSizeY[1] = m_currentHeight;


...


        // Work size for UV plane: half resolution in both dimensions
        m_GlobalWorkSizeUV[0] = m_currentWidth / 2;
        m_GlobalWorkSizeUV[1] = m_currentHeight / 2;


...
}

// Excerpt: launch the Y and UV kernels, each with its plane-specific
// work size prepared in PrepareSharedSurfaces().
cl_int OpenCLFilterBase::ProcessSurface()
{
...
        // enqueue kernels: one 2D NDRange per plane
        error = clEnqueueNDRangeKernel(m_clqueue, m_kernels[m_activeKernel].clkernelY, 2, NULL, m_GlobalWorkSizeY, m_LocalWorkSizeY, 0, NULL, NULL);
        if (error) {
            log.error() << "clEnqueueNDRangeKernel for Y plane failed. Error code: " << error << endl;
            return error;
        }
        error = clEnqueueNDRangeKernel(m_clqueue, m_kernels[m_activeKernel].clkernelUV, 2, NULL, m_GlobalWorkSizeUV, m_LocalWorkSizeUV, 0, NULL, NULL);
        if (error) {
            log.error() << "clEnqueueNDRangeKernel for UV plane failed. Error code: " << error << endl;
            return error;
        }
...
}

3. OpenCL kernel代码分析

最后看ocl_rotate.cl里的cl kernel部分

对应的OpenCL kernel里的read_imagef()读回来的float4 pixel,

通过这个链接 image2d_t direct pixel access with vload/vstore - Intel Communities可以知道

Y对应的是pixel.xyzw里的pixel.x,

UV对回来的pixel, U对应的是pixel.x, V对应的是pixel.y

读回来的是介于0~1之间的浮点值,要对应上0~255的8bit整数值, 需要再乘以255.0

  U8 Y=pixel.x * 255.0f

  U8 U=pixel.x * 255.0f

  U8 V=pixel.y * 255.0f

// Rotate the Y plane by 180 degrees: each work-item copies one luma sample
// to its point-mirrored position in the destination image.
__kernel void rotate_Y(__read_only image2d_t YIn, __write_only image2d_t YOut)
{
    const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    int2 src = (int2)(get_global_id(0), get_global_id(1));
    int2 size = (int2)(get_global_size(0), get_global_size(1));
    int2 dst = size - (int2)(1, 1) - src;

    // rotate Y plane: read at src, write the same value at the mirrored dst
    write_imagef(YOut, dst, read_imagef(YIn, smp, src));
}

// Rotate the interleaved UV plane by 180 degrees, mirroring each chroma
// sample pair to the opposite corner of the (half-resolution) UV grid.
__kernel void rotate_UV(__read_only image2d_t UVIn, __write_only image2d_t UVOut)
{
    const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    int2 src = (int2)(get_global_id(0), get_global_id(1));
    int2 size = (int2)(get_global_size(0), get_global_size(1));
    int2 dst = size - (int2)(1, 1) - src;

    // rotate UV plane: read at src, write the same value at the mirrored dst
    write_imagef(UVOut, dst, read_imagef(UVIn, smp, src));
}

 最后用mediasdk sample_decode的代码验证一下自己的理解

sample_decode在使用-d3d11 -hw -r参数时,即使用硬件解码、走d3d11框架且显示到屏幕

在一开始分配硬件解码器所需要的frame buffer时,每一个frame会生成2个texture2D buffer,分别放到2组vector容器里

  • 第一组Texture2D

创建参数为

        desc.Usage = D3D11_USAGE_DEFAULT;
        desc.MiscFlags = m_initParams.uncompressedResourceMiscFlags | D3D11_RESOURCE_MISC_SHARED;

表示这个Texture只能够由GPU访问,且可以共享给OpenCL框架,保存在newTexture.textures里

        // Allocate the GPU-only (D3D11_USAGE_DEFAULT) decode targets; each
        // created texture is an array of desc.ArraySize frames, hence the
        // division in the loop bound.
        for(size_t i = 0; i < request->NumFrameSuggested / desc.ArraySize; i++)
        {
            hRes = m_initParams.pDevice->CreateTexture2D(&desc, NULL, &pTexture2D);

            if (FAILED(hRes))
            {
                msdk_printf(MSDK_STRING("CreateTexture2D(%lld) failed, hr = 0x%08lx\n"), (long long)i, hRes);
                return MFX_ERR_MEMORY_ALLOC;
            }
            newTexture.textures.push_back(pTexture2D);
        }
  • 第二组Texture2D

 创建参数为

        desc.Usage = D3D11_USAGE_STAGING;
        desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;

表示这个Texture可以从CPU读取,保存在newTexture.stagingTextures里

        // Allocate the CPU-readable staging copies (D3D11_USAGE_STAGING with
        // D3D11_CPU_ACCESS_READ), one per suggested frame.
        for(size_t i = 0; i < request->NumFrameSuggested; i++)
        {
            hRes = m_initParams.pDevice->CreateTexture2D(&desc, NULL, &pTexture2D);

            if (FAILED(hRes))
            {
                msdk_printf(MSDK_STRING("Create staging texture(%lld) failed hr = 0x%X\n"), (long long)i, (unsigned int)hRes);
                return MFX_ERR_MEMORY_ALLOC;
            }
            newTexture.stagingTexture.push_back(pTexture2D);
        }

接下来在输出解码出来的frame buffer时的流程如下

输出的frame buffer放在newTexture.textures里,这个buffer不能从CPU直接访问

  • 如果是输出到文件,则把textures的数据通过CopySubresourceRegion()函数拷贝到stagingTexture里,然后再由CPU读出来写入文件
  • 如果是输出到屏幕,则直接把textures的数据通过VideoProcessorBlt()拷贝到DXGI的backBuffer里,再切换swapchain把backBuffer切换到frontBuffer显示

我修改了一下CD3D11Device::RenderFrame()函数,输出到屏幕流程里,在调用VideoProcessorBlt()前,调用SimulateCL(); 通过OpenCL直接修改newTexture.textures的frame buffer(这个buffer是不能通过CPU直接写入的,但是通过ocl kernel可以在GPU侧直接读写)

在屏幕左上角坐标(y=16, x=32)的地方,画一个48*48的方块区域,此区域亮度做0~255间的明暗变化;同时在左上角坐标(y=16, x=32), 画一个96*96的方块区域,此区域呈灰度显示

void SimulateCL()
{
	cl_int status;
	static float fDimmerSwitch = 0.0f;  //Y = 255 * fDimmerSwitch
	cl_int top = 16;
	cl_int left = 32;

	//see page 107 of rev21 of the spec, clearly states this won't return until all D3D11 has completed
	status = ptrToFunction_clEnqueueAcquireD3D11ObjectsKHR(g_clCommandQueue, 1, &g_SharedNV12imageCLMemObject_Y, 0, 0, 0);

	status = ptrToFunction_clEnqueueAcquireD3D11ObjectsKHR(g_clCommandQueue, 1, &g_SharedNV12imageCLMemObject_UV, 0, 0, 0);


	status = clSetKernelArg(cl_kernel_drawBox_Y, 0, sizeof(cl_mem), &g_SharedNV12imageCLMemObject_Y);
	testStatus(status, "clSetKernelArg");
	status = clSetKernelArg(cl_kernel_drawBox_Y, 1, sizeof(cl_int), &top);
	status = clSetKernelArg(cl_kernel_drawBox_Y, 2, sizeof(cl_int), &left);
	status = clSetKernelArg(cl_kernel_drawBox_Y, 3, sizeof(cl_float), &fDimmerSwitch);
	testStatus(status, "clSetKernelArg");

	status = clSetKernelArg(cl_kernel_drawBox_UV, 0, sizeof(cl_mem), &g_SharedNV12imageCLMemObject_UV);
	status = clSetKernelArg(cl_kernel_drawBox_UV, 1, sizeof(cl_int), &top);
	status = clSetKernelArg(cl_kernel_drawBox_UV, 2, sizeof(cl_int), &left);
	status = clSetKernelArg(cl_kernel_drawBox_UV, 3, sizeof(cl_float), &fDimmerSwitch);

	size_t global_dim[2];
	global_dim[0] = SHARED_IMAGE_HEIGHT;
	global_dim[1] = SHARED_IMAGE_WIDTH;

	status = clEnqueueNDRangeKernel(g_clCommandQueue, cl_kernel_drawBox_Y, 2, NULL, global_dim, NULL, 0, NULL, NULL);
	testStatus(status, "clEnqueueNDRangeKernel fail");
	status = clEnqueueNDRangeKernel(g_clCommandQueue, cl_kernel_drawBox_UV, 2, NULL, global_dim, NULL, 0, NULL, NULL);

	status = ptrToFunction_clEnqueueReleaseD3D11ObjectsKHR(g_clCommandQueue, 1, &g_SharedNV12imageCLMemObject_Y, 0, NULL, NULL);
	testStatus(status, "Fail on clEnqueueReleaseD3D11ObjectsKHR");
	status = ptrToFunction_clEnqueueReleaseD3D11ObjectsKHR(g_clCommandQueue, 1, &g_SharedNV12imageCLMemObject_UV, 0, NULL, NULL);
	clFinish(g_clCommandQueue);

	fDimmerSwitch += .010f;
	if(fDimmerSwitch > 1.0)
	{
		fDimmerSwitch = 0.0f;
	}

}

对应的cl kernel

//Y的亮度根据传进来的fDimmerSwitch做明暗交替的变化
// Fill a box in the Y (luma) plane whose brightness follows fDimmerSwitch
// (luma = 255 * fDimmerSwitch). The box origin is (left, top); its extent is
// the NDRange global size chosen by the host, since every work-item writes.
kernel void drawBox_Y( __write_only image2d_t output, int top, int left, float fDimmerSwitch)
{
       // DEAD CODE removed: the original declared xMin/xMax/yMin/yMax and a
       // stray brace block left over from a template whose bounds check had
       // been deleted; neither affected behavior.
       int x = get_global_id(0)+left;
       int y = get_global_id(1)+top;

       write_imagef(output, (int2)(x, y), (float4)(fDimmerSwitch, 0.f, 0.f, 1.f));
}

//UV的值直接设成0.5f, 0.5f对应的整数值是128, 128 即此区域没有颜色值,只有灰度色
// Force the box's chroma to neutral gray: U = V = 0.5f, i.e. 128 in 8-bit,
// meaning no color — luma only. Chroma coordinates are half the luma ones,
// hence the left/2 and top/2 offsets. fDimmerSwitch is unused here but kept
// so the host-side clSetKernelArg indices stay aligned with drawBox_Y.
kernel void drawBox_UV( __write_only image2d_t output, int top, int left,float fDimmerSwitch)
{
       int2 uvCoord = (int2)(get_global_id(0) + left/2, get_global_id(1) + top/2);
       write_imagef(output, uvCoord, (float4)(0.5f, 0.5f, 0.f, 1.f));
}

运行一下

搞定收工 :)

总结一下代码学习中踩过的小坑

  1. d3d11的texture2d对应opencl里的image2d, 所以需要用opencl里的sampler采样器来访问,无法直接用指针通过线性地址来读取写入
  2. mediasdk sample_decode传出来用来显示和保存的frame buffer,还会被用来做后面frame的参考帧,所以通过opencl写入的色块,会导致后面解码帧的错误,可以用肉眼观察到。所以newTexture.textures里的数据,只能读不能写,最好是把数据从newTexture.textures拷贝到自己的frame buffer里去做进一步处理
  3. read_imagef/write_imagef的float4 pixel.xyzw里,Y亮度在pixel.x中, U对应pixel.x, V对应pixel.y, 浮点值在0~1之间,对应整数值需要乘以255来对应我们传统的YUV 8bit数值
  4. NV12格式的texture2D frame buffer, 需要创建2个image2t对象分别对应里面的Y和UV
// Y plane: subresource index 0 of the NV12 texture
g_SharedNV12imageCLMemObject_Y = ptrToFunction_clCreateFromD3D11Texture2DKHR(g_clContext, CL_MEM_READ_WRITE, g_pSharedDX11Texture2D, 0, &status);

// UV plane: subresource index 1
// BUG FIX: the original assigned BOTH planes to the same variable
// (g_SharedNV12imageCLMemObject), overwriting the Y object's handle.
// Use the distinct _Y/_UV globals that SimulateCL() expects.
g_SharedNV12imageCLMemObject_UV = ptrToFunction_clCreateFromD3D11Texture2DKHR(g_clContext, CL_MEM_READ_WRITE, g_pSharedDX11Texture2D, 1, &status);

最后照例完整测试代码奉上,仅供参考

d3d11_ocl_sharing: 用Intel mediasdk2020的sample_decode验证通过ocl扩展cl_khr_d3d11_sharing让opencl和d3d11 texture2D共享frame buffer的方法

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值