ffplay: hardware decoding and rendering with dxva2

ffplay customization series

Chapter 1: Custom player interface
Chapter 2: Variable-speed playback
Chapter 3: dxva2 hardware decoding and rendering (this chapter)
Chapter 4: Providing a C# interface
Chapter 5: Building a WPF player



Preface

ffplay itself supports specifying the decoder; setting h264_qsv, hevc_cuvid and the like enables hardware decoding, and in practice it does work, with CPU usage dropping somewhat. But this is not the best approach. On Windows, a better solution is to decode with dxva2 and render with d3d9: this not only cuts CPU usage dramatically, it also lowers GPU usage somewhat, and decoding is faster as well. ffplay, however, has no built-in dxva2 support, so we have to extend it ourselves.
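
For reference, forcing a named decoder in ffplay is done with the -vcodec option (e.g. ffplay -vcodec h264_qsv input.mp4). Internally this boils down to looking the decoder up by name; a simplified sketch of that lookup (the helper name here is mine, not ffplay's):

#include "libavcodec/avcodec.h"

// Simplified sketch: when a decoder name such as "h264_qsv" or "hevc_cuvid" is
// forced, look it up by name; otherwise fall back to the default decoder for
// the stream's codec id.
static AVCodec* find_video_decoder(enum AVCodecID codec_id, const char* forced_codec_name)
{
	AVCodec* codec = NULL;
	if (forced_codec_name)
		codec = avcodec_find_decoder_by_name(forced_codec_name);
	if (!codec)
		codec = avcodec_find_decoder(codec_id);
	return codec;
}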


I. Using dxva2 with ffmpeg

dxva2 decoding and rendering consists of two steps: decoding and rendering. The reason it is such a good solution is that both steps happen on the graphics card; the decoded data never has to be copied back to system memory, it is converted in video memory and rendered directly. ffmpeg does ship sample code for dxva2, but it has no GPU rendering part, so its performance is no different from simply specifying a hardware decoder. What we need to follow is a blog post on ffmpeg_dxva2 decoding and rendering whose project is named Win32Project1 (I can no longer find that post, so I will not link to other similar ones).

The header file is as follows:

/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef FFMPEG_DXVA2_H
#define FFMPEG_DXVA2_H

#include "windows.h" // needed for the HWND used in the dxva2_init declaration below

extern "C"{
#include "libavcodec/avcodec.h"
#include "libavutil/pixfmt.h"
#include "libavutil/rational.h"
}

enum HWAccelID {
    HWACCEL_NONE = 0,
    HWACCEL_AUTO,
    HWACCEL_VDPAU,
    HWACCEL_DXVA2,
    HWACCEL_VDA,
    HWACCEL_VIDEOTOOLBOX,
    HWACCEL_QSV,
};

typedef struct AVStream AVStream;
typedef struct AVCodecContext AVCodecContext;
typedef struct AVCodec AVCodec;
typedef struct AVFrame AVFrame;
typedef struct AVDictionary AVDictionary;

typedef struct InputStream {
    int file_index;
    AVStream *st;
    int discard;             /* true if stream data should be discarded */
    int user_set_discard;
    int decoding_needed;     /* non zero if the packets must be decoded in 'raw_fifo', see DECODING_FOR_* */
#define DECODING_FOR_OST    1
#define DECODING_FOR_FILTER 2

    AVCodecContext *dec_ctx;
    AVCodec *dec;
    AVFrame *decoded_frame;
    AVFrame *filter_frame; /* a ref of decoded_frame, to be sent to filters */

    int64_t       start;     /* time when read started */
    /* predicted dts of the next packet read for this stream or (when there are
        * several frames in a packet) of the next frame in current packet (in AV_TIME_BASE units) */
    int64_t       next_dts;
    int64_t       dts;       ///< dts of the last packet read for this stream (in AV_TIME_BASE units)

    int64_t       next_pts;  ///< synthetic pts for the next decode frame (in AV_TIME_BASE units)
    int64_t       pts;       ///< current pts of the decoded frame  (in AV_TIME_BASE units)
    int           wrap_correction_done;

    int64_t filter_in_rescale_delta_last;

    int64_t min_pts; /* pts with the smallest value in a current stream */
    int64_t max_pts; /* pts with the higher value in a current stream */
    int64_t nb_samples; /* number of samples in the last decoded audio frame before looping */

    double ts_scale;
    int saw_first_ts;
    int showed_multi_packet_warning;
    AVDictionary *decoder_opts;
    AVRational framerate;               /* framerate forced with -r */
    int top_field_first;
    int guess_layout_max;

    int autorotate;
    int resample_height;
    int resample_width;
    int resample_pix_fmt;

    int      resample_sample_fmt;
    int      resample_sample_rate;
    int      resample_channels;
    uint64_t resample_channel_layout;

    int fix_sub_duration;
    struct { /* previous decoded subtitle and related variables */
        int got_output;
        int ret;
        AVSubtitle subtitle;
    } prev_sub;

    struct sub2video {
        int64_t last_pts;
        int64_t end_pts;
        AVFrame *frame;
        int w, h;
    } sub2video;

    int dr1;

    /* decoded data from this stream goes into all those filters
        * currently video and audio only */
    //InputFilter **filters;
    //int        nb_filters;

    //int reinit_filters;

    /* hwaccel options */
    enum HWAccelID hwaccel_id;
    char  *hwaccel_device;

    /* hwaccel context */
    enum HWAccelID active_hwaccel_id;
    void  *hwaccel_ctx;
    void(*hwaccel_uninit)(AVCodecContext *s);
    int(*hwaccel_get_buffer)(AVCodecContext *s, AVFrame *frame, int flags);
    int(*hwaccel_retrieve_data)(AVCodecContext *s, AVFrame *frame);
    enum AVPixelFormat hwaccel_pix_fmt;
    enum AVPixelFormat hwaccel_retrieved_pix_fmt;

    /* stats */
    // combined size of all the packets read
    uint64_t data_size;
    /* number of packets successfully read for this stream */
    uint64_t nb_packets;
    // number of frames/samples retrieved from the decoder
    uint64_t frames_decoded;
    uint64_t samples_decoded;
} InputStream;


int dxva2_init(AVCodecContext *s, HWND hwnd);
int dxva2_retrieve_data_call(AVCodecContext *s, AVFrame *frame);

#endif /* FFMPEG_DXVA2_H */


II. Decoding

The changes to ffplay's decoding are made in stream_component_open:

1. Add the fields

Include the ffmpeg_dxva2.h header from Win32Project1:

#include "ffmpeg_dxva2.h"

Define an enum describing the hardware acceleration type:

/// <summary>
/// Hardware acceleration options
/// </summary>
typedef enum
{
	AC_HARDWAREACCELERATETYPE_DISABLED,
	AC_HARDWAREACCELERATETYPE_AUTO,
	// Decode with dxva, Windows only. When it starts successfully: the pixformat of the
	// started/display events is AC_PIXELFORMAT_DXVA2_VLD, and data[3] of the render event
	// is the d3d9 surface object.
	AC_HARDWAREACCELERATETYPE_DXVA
}ACHardwareAccelerateType;

Add the following fields to VideoState: the hardware acceleration type, plus Win32Project1's InputStream object:

ACHardwareAccelerateType hwaccel;
InputStream* ist;

Add the corresponding interface:

// Set the hardware acceleration type
void ac_play_setHardwareAccelerateType(ACPlay play, ACHardwareAccelerateType value) {
	VideoState* s = (VideoState*)play;
	s->hwaccel = value;
}
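
With the setter in place, a caller of the Chapter 1 player interface enables dxva before opening the media. A hypothetical usage sketch (ac_play_create and ac_play_open stand in for the Chapter 1 creation/open functions, whatever they are actually named):

// The acceleration type must be set before the stream is opened, because
// stream_component_open reads it while initializing the decoder.
ACPlay play = ac_play_create();
ac_play_setHardwareAccelerateType(play, AC_HARDWAREACCELERATETYPE_DXVA);
ac_play_open(play, "test.mp4");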

2. Initialization

In stream_component_open, on the line just above avcodec_open2, add the logic that checks hwaccel and initializes dxva. dxva2_init is the method from Win32Project1; it requires an hwnd, and that hwnd must be the render window's. Achieving the same performance without supplying an hwnd requires further changes that are beyond the scope of this article.

if (is->hwaccel == AC_HARDWAREACCELERATETYPE_AUTO || is->hwaccel == AC_HARDWAREACCELERATETYPE_DXVA)
{
	switch (codec->id)
	{
	// Formats supported by dxva2
	case AV_CODEC_ID_MPEG2VIDEO:
	case AV_CODEC_ID_H264:
	case AV_CODEC_ID_VC1:
	case AV_CODEC_ID_WMV3:
	case AV_CODEC_ID_HEVC:
	case AV_CODEC_ID_VP9:
	{
		avctx->thread_count = 1;  // Multithreading is apparently not compatible with hardware decoding
		is->ist = av_mallocz(sizeof(InputStream));
		is->ist->hwaccel_id = HWACCEL_AUTO;
		is->ist->active_hwaccel_id = HWACCEL_AUTO;
		is->ist->hwaccel_device = "dxva2";
		is->ist->dec = codec;
		is->ist->dec_ctx = avctx;
		avctx->opaque = is->ist;
		if (dxva2_init(avctx, is->hwnd) == 0)
		{
			avctx->get_buffer2 = is->ist->hwaccel_get_buffer;
			avctx->get_format = GetHwFormat;
			avctx->thread_safe_callbacks = 1;
			avctx->pix_fmt = AV_PIX_FMT_DXVA2_VLD;
		}
		else
		{
			av_free(is->ist);
			is->ist = NULL;
		}		
	}
	break;
	}
}
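
The is->hwnd passed to dxva2_init above must hold the native handle of the render window; how it is obtained is not part of the code shown here. One way to get it from the SDL window that ffplay creates is sketched below (storing it into a new VideoState field is up to you):

#include <SDL_syswm.h>

// Sketch: query SDL for the native Win32 handle of ffplay's render window so it
// can be stored in is->hwnd before stream_component_open runs.
static HWND get_render_hwnd(SDL_Window* window)
{
	SDL_SysWMinfo info;
	SDL_VERSION(&info.version);   // must be filled in before the query
	if (SDL_GetWindowWMInfo(window, &info) && info.subsystem == SDL_SYSWM_WINDOWS)
		return info.info.win.window;
	return NULL;
}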

This sets the decoded AVFrame's format to AV_PIX_FMT_DXVA2_VLD. GetHwFormat used in the code above is defined as follows:

static enum AVPixelFormat GetHwFormat(AVCodecContext* s, const enum AVPixelFormat* pix_fmts)
{
	InputStream* ist = (InputStream*)s->opaque;
	ist->active_hwaccel_id = HWACCEL_DXVA2;
	ist->hwaccel_pix_fmt = AV_PIX_FMT_DXVA2_VLD;
	return ist->hwaccel_pix_fmt;
}
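
Note that this callback asserts dxva2 unconditionally instead of inspecting the list of formats the decoder offers. A slightly more defensive variant (my own sketch, not from the original post) scans pix_fmts first and only switches to dxva2 when the decoder actually offers it:

static enum AVPixelFormat GetHwFormat2(AVCodecContext* s, const enum AVPixelFormat* pix_fmts)
{
	const enum AVPixelFormat* p;
	for (p = pix_fmts; *p != AV_PIX_FMT_NONE; p++) {
		if (*p == AV_PIX_FMT_DXVA2_VLD) {
			// decoder offers dxva2, record it and use it
			InputStream* ist = (InputStream*)s->opaque;
			ist->active_hwaccel_id = HWACCEL_DXVA2;
			ist->hwaccel_pix_fmt = AV_PIX_FMT_DXVA2_VLD;
			return *p;
		}
	}
	return pix_fmts[0];  // no dxva2 entry, keep the decoder's first (software) choice
}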

3. Deinitialization

Add the following cleanup code to stream_close, where dxva2_uninit2 is Win32Project1's dxva2_uninit with its parameter type changed to InputStream*.

if (is->ist)
{
	dxva2_uninit2(is->ist);
	av_free(is->ist);
	is->ist = NULL;
}
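
The body of dxva2_uninit2 is not shown here; a minimal sketch of the wrapper, assuming dxva2_init registered its cleanup routine in ist->hwaccel_uninit (as the upstream ffmpeg_dxva2.c does), could look like this:

// Sketch only: the parameter is InputStream* instead of AVCodecContext*, so it can be
// called directly with is->ist from stream_close. The actual cleanup (releasing the
// DXVA2 decoder, the d3d9 device and its surfaces) is the routine dxva2_init registered.
void dxva2_uninit2(InputStream* ist)
{
	if (ist->hwaccel_uninit)
		ist->hwaccel_uninit(ist->dec_ctx);
	ist->hwaccel_ctx = NULL;
}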

III. Rendering

With the decoding set up as above, the decoded data is a d3d9 surface, stored in avframe.data[3]; we need to handle this object and get it onto the screen. Fortunately Win32Project1 already provides this part, so all we have to do is call it.
In video_display, on the line just above SDL_RenderClear, add the following code.

Frame* vp;
vp = frame_queue_peek_last(&is->pictq);
if (vp->format == AV_PIX_FMT_DXVA2_VLD)
{
	dxva2_retrieve_data_call(is->viddec.avctx, vp->frame);
	return;
}

Summary

That covers everything for today. The dxva2 hardware decoding and rendering implemented this way performs very well: the immediate impression is that rendering a 4K video keeps CPU usage below 1%, with GPU usage around 50%, whereas merely specifying a hardware decoder yields about 10% CPU usage with GPU usage likewise around 50%. The reason is explained in the Win32Project1 ffmpeg_dxva2 blog post, so it is not repeated here. In short, once ffplay supports dxva2 hardware decoding and rendering it becomes useful in far more real-world scenarios, for instance it directly solves the problem of insufficient performance when rendering many real-time streams at once.
