说明:
HEVC解码并行分3个级别:frame并行,slice并行和wpp。FFmpeg默认提供frame并行和slice并行的框架。针对HEVC,FFmpeg实现的帧内并行是wpp。
1)关于thread_type,也就是并行模式,其实分两种:slice并行和frame+slice并行(注意这句话:Frame thread: Restrictions with slice threading also apply)。所以openHEVC在frame thread init中也会执行slice thread init;优先判断frame thread;参数命名上,默认的参数是 for slice,特别注明的才是 for frame。
2)FFmpeg中并行解码部分稍显混乱,264与265共用了部分上层框架,但264的并行解码又有若干问题,间接影响了265。
3)openHEVC的并行解码代码就清晰不少。FFmpeg中的并行解码部分大概同步到openHEVC 2013年10月的提交,之后可能因为框架原因,没有再同步。
1 validate_thread_parameters
设置active_thread_type 对应并行级别
/**
 * Decide which threading mode is actually used and store it in
 * avctx->active_thread_type.
 *
 * Threading requires more than one thread.
 * Frame threading requires entire frames to be passed to the codec,
 * and introduces extra decoding delay, so it is ruled out by the
 * TRUNCATED, LOW_DELAY and CHUNKS flags.
 *
 * Candidates are tried in this order: single thread (no threading),
 * frame threading, slice threading. If none applies and the codec does not
 * advertise AV_CODEC_CAP_AUTO_THREADS, the context is forced back to one
 * thread; with AV_CODEC_CAP_AUTO_THREADS the context is left untouched.
 *
 * @param avctx The context.
 */
static void validate_thread_parameters(AVCodecContext *avctx)
{
    const int caps      = avctx->codec->capabilities;
    const int requested = avctx->thread_type;
    int can_frame_thread = 0;

    /* Frame-level threading needs codec support and none of the blocking flags. */
    if (caps & AV_CODEC_CAP_FRAME_THREADS) {
        can_frame_thread =
            !(avctx->flags  & (AV_CODEC_FLAG_TRUNCATED | AV_CODEC_FLAG_LOW_DELAY)) &&
            !(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS);
    }

    if (avctx->thread_count == 1) {
        /* A single thread: nothing to parallelise. */
        avctx->active_thread_type = 0;
    } else if (can_frame_thread && (requested & FF_THREAD_FRAME)) {
        /* Frame threading is checked first, so it wins when both are requested. */
        avctx->active_thread_type = FF_THREAD_FRAME;
    } else if ((caps & AV_CODEC_CAP_SLICE_THREADS) && (requested & FF_THREAD_SLICE)) {
        /* Slice-level threading. */
        avctx->active_thread_type = FF_THREAD_SLICE;
    } else if (!(caps & AV_CODEC_CAP_AUTO_THREADS)) {
        /* No usable mode and the codec does not thread on its own. */
        avctx->thread_count       = 1;
        avctx->active_thread_type = 0;
    }
    /* Otherwise the codec has AV_CODEC_CAP_AUTO_THREADS: leave the settings as they are. */

    if (avctx->thread_count > MAX_AUTO_THREADS) {
        av_log(avctx, AV_LOG_WARNING,
               "Application has requested %d threads. Using a thread count greater than %d is not recommended.\n",
               avctx->thread_count, MAX_AUTO_THREADS);
    }
}
.capabilities := AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS
12322 = 1<< 1 | 1<< 5 | 1<< 13 | 1<< 12
2
3
4
5 ff_thread_init pthread.c
/**
 * Entry point for thread setup: pick the active threading mode, then run the
 * matching initialiser.
 *
 * @param avctx codec context being opened
 * @return the result of ff_slice_thread_init() or ff_frame_thread_init(),
 *         or 0 when no threading mode is active
 */
int ff_thread_init(AVCodecContext *avctx)
{
    int mode;

    /* Fills in avctx->active_thread_type (see section 1 of these notes). */
    validate_thread_parameters(avctx);
    mode = avctx->active_thread_type;

    if (mode & FF_THREAD_SLICE) {
        /* Slice threading (see section 7). */
        return ff_slice_thread_init(avctx);
    }
    if (mode & FF_THREAD_FRAME) {
        /* Frame threading (see section 6). */
        return ff_frame_thread_init(avctx);
    }
    return 0;
}
avcodec_open2
-ff_thread_init
--ff_slice_thread_init (active_thread_type & FF_THREAD_SLICE)
--ff_frame_thread_init (active_thread_type & FF_THREAD_FRAME)
从pthread.c中分离出frame, slice级别的代码,独立为pthread_frame.c和pthread_slice.c。
6 ff_frame_thread_init pthread_frame.c
/**
 * Set up frame-level threading for a codec context.
 *
 * Allocates the FrameThreadContext and one PerThreadContext per worker, gives
 * every worker its own copy of the AVCodecContext, and starts
 * frame_worker_thread() on each of them.
 *
 * @param avctx codec context; a thread_count of 0 means "derive the count
 *              from the number of CPUs"
 * @return 0 on success, including the case where at most one thread would be
 *         used and threading is simply switched off; otherwise a non-zero
 *         error: AVERROR(ENOMEM), the value returned by the codec's init /
 *         init_thread_copy callback, or AVERROR() of the pthread_create()
 *         result
 */
int ff_frame_thread_init(AVCodecContext *avctx)
{
    int thread_count = avctx->thread_count;
    const AVCodec *codec = avctx->codec;
    AVCodecContext *src = avctx;
    FrameThreadContext *fctx;
    int i, err = 0;

#if HAVE_W32THREADS
    w32thread_init();
#endif

    /* thread_count was left unset (or set to 0) when the codec was opened:
     * derive it from the CPU count. */
    if (!thread_count) {
        int nb_cpus = av_cpu_count();
        if ((avctx->debug & (FF_DEBUG_VIS_QP | FF_DEBUG_VIS_MB_TYPE)) || avctx->debug_mv)
            nb_cpus = 1;
        // use number of cores + 1 as thread count if there is more than one
        if (nb_cpus > 1)
            thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS);
        else
            thread_count = avctx->thread_count = 1;
    }

    /* One thread or fewer: run without threading. */
    if (thread_count <= 1) {
        avctx->active_thread_type = 0;
        return 0;
    }

    /* Frame-thread context shared by all workers. */
    avctx->internal->thread_ctx = fctx = av_mallocz(sizeof(FrameThreadContext));
    if (!fctx)
        return AVERROR(ENOMEM);

    /* Per-worker state, one zeroed entry per thread. */
    fctx->threads = av_mallocz_array(thread_count, sizeof(PerThreadContext));
    if (!fctx->threads) {
        av_freep(&avctx->internal->thread_ctx);
        return AVERROR(ENOMEM);
    }

    pthread_mutex_init(&fctx->buffer_mutex, NULL);
    fctx->delaying = 1;

    for (i = 0; i < thread_count; i++) {
        AVCodecContext *copy = av_malloc(sizeof(AVCodecContext));
        PerThreadContext *p  = &fctx->threads[i];

        pthread_mutex_init(&p->mutex, NULL);
        pthread_mutex_init(&p->progress_mutex, NULL);
        pthread_cond_init(&p->input_cond, NULL);
        pthread_cond_init(&p->progress_cond, NULL);
        pthread_cond_init(&p->output_cond, NULL);

        p->frame = av_frame_alloc();
        if (!p->frame) {
            /* copy has not been stored in p->avctx yet, so release it here.
             * (The source text had "&copy" mangled into a (c) symbol.) */
            av_freep(&copy);
            err = AVERROR(ENOMEM);
            goto error;
        }

        p->parent = fctx;
        p->avctx  = copy;

        if (!copy) {
            err = AVERROR(ENOMEM);
            goto error;
        }

        /* Start from a bitwise copy of the template context: the caller's
         * avctx for worker 0, worker 0's context for all later workers. */
        *copy = *src;

        copy->internal = av_malloc(sizeof(AVCodecInternal));
        if (!copy->internal) {
            /* Drop the priv_data pointer inherited through the struct copy
             * before handing the partially built context to the cleanup. */
            copy->priv_data = NULL;
            err = AVERROR(ENOMEM);
            goto error;
        }
        *copy->internal = *src->internal;
        copy->internal->thread_ctx = p;
        copy->internal->pkt = &p->avpkt;

        if (!i) {
            /* First worker: it becomes the template for the others and runs
             * the codec's regular init on the inherited priv_data. */
            src = copy;

            if (codec->init)
                err = codec->init(copy);

            /* Propagate the first worker's context into the caller's avctx
             * (helper defined elsewhere; argument order assumed dst, src). */
            update_context_from_thread(avctx, copy, 1);
        } else {
            /* Later workers: private priv_data cloned from the first worker. */
            copy->priv_data = av_malloc(codec->priv_data_size);
            if (!copy->priv_data) {
                err = AVERROR(ENOMEM);
                goto error;
            }
            memcpy(copy->priv_data, src->priv_data, codec->priv_data_size);
            copy->internal->is_copy = 1;

            if (codec->init_thread_copy)
                err = codec->init_thread_copy(copy);
        }

        if (err) goto error;

        /* Start the frame-level decoding thread for this worker. */
        err = AVERROR(pthread_create(&p->thread, NULL, frame_worker_thread, p));
        p->thread_init= !err;
        if(!p->thread_init)
            goto error;
    }

    return 0;

error:
    /* Tear down the i+1 workers touched so far, including the one that failed. */
    ff_frame_thread_free(avctx, i+1);

    return err;
}
7 ff_slice_thread_init pthread_slice.c
/**
 * Set up slice-level threading: allocate the SliceThreadContext, start the
 * worker threads and install the threaded execute callbacks.
 *
 * @param avctx codec context; a thread_count of 0 means "derive the count
 *              from the number of CPUs"
 * @return 0 on success, including the case where at most one thread would be
 *         used and threading is switched off; -1 on allocation or thread
 *         creation failure
 */
int ff_slice_thread_init(AVCodecContext *avctx)
{
    SliceThreadContext *ctx;
    int nb_threads = avctx->thread_count;
    int idx;

#if HAVE_W32THREADS
    w32thread_init();
#endif

    // We cannot do this in the encoder init as the threads are created before
    if (av_codec_is_encoder(avctx->codec) &&
        avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO &&
        avctx->height > 2800) {
        avctx->thread_count = 1;
        nb_threads          = 1;
    }

    /* thread_count of 0: derive it from the CPU count. */
    if (nb_threads == 0) {
        int cpus = av_cpu_count();

        if (avctx->height) {
            /* Never more candidates than 16-line rows in the picture. */
            const int rows = (avctx->height + 15) / 16;
            cpus = FFMIN(cpus, rows);
        }

        // use number of cores + 1 as thread count if there is more than one
        /* The notes attribute the MAX_AUTO_THREADS cap (16) to an H.264 limitation. */
        nb_threads = (cpus > 1) ? FFMIN(cpus + 1, MAX_AUTO_THREADS) : 1;
        avctx->thread_count = nb_threads;
    }

    if (nb_threads <= 1) {
        avctx->active_thread_type = 0;
        return 0;
    }

    /* Slice-thread context. */
    ctx = av_mallocz(sizeof(SliceThreadContext));
    if (!ctx) {
        return -1;
    }

    ctx->workers = av_mallocz_array(nb_threads, sizeof(pthread_t));
    if (!ctx->workers) {
        av_free(ctx);
        return -1;
    }

    avctx->internal->thread_ctx = ctx;

    ctx->current_job = 0;
    ctx->job_count   = 0;
    ctx->job_size    = 0;
    ctx->done        = 0;

    pthread_cond_init(&ctx->current_job_cond, NULL);
    pthread_cond_init(&ctx->last_job_cond, NULL);
    pthread_mutex_init(&ctx->current_job_lock, NULL);

    /* The job lock is held while the workers are being started. */
    pthread_mutex_lock(&ctx->current_job_lock);
    for (idx = 0; idx < nb_threads; idx++) {
        if (pthread_create(&ctx->workers[idx], NULL, worker, avctx) == 0) {
            continue;
        }

        /* Creation failed: record how many workers exist, then tear down. */
        avctx->thread_count = idx;
        pthread_mutex_unlock(&ctx->current_job_lock);
        ff_thread_free(avctx);
        return -1;
    }

    /* Helper defined elsewhere; it is passed the number of workers created.
     * NOTE(review): presumably it also releases current_job_lock - confirm. */
    thread_park_workers(ctx, nb_threads);

    avctx->execute  = thread_execute;
    avctx->execute2 = thread_execute2;
    return 0;
}
///< 若thread_count为0 则根据cpu数适配
int nb_cpus = av_cpu_count();
if (avctx->height)
nb_cpus = FFMIN(nb_cpus, (avctx->height+15)/16);
// use number of cores + 1 as thread count if there is more than one
if (nb_cpus > 1) ///< cores + 1, 16(264的原因限制)
thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS);
else
thread_count = avctx->thread_count = 1;
}
if (thread_count <= 1) {
avctx->active_thread_type = 0;
return 0;
}
c = av_mallocz(sizeof(SliceThreadContext)); ///< slice thread context
if (!c)
return -1;
c->workers = av_mallocz_array(thread_count, sizeof(pthread_t));
if (!c->workers) {
av_free(c);
return -1;
}
avctx->internal->thread_ctx = c;
c->current_job = 0;
c->job_count = 0;
c->job_size = 0;
c->done = 0;
pthread_cond_init(&c->current_job_cond, NULL);
pthread_cond_init(&c->last_job_cond, NULL);
pthread_mutex_init(&c->current_job_lock, NULL);
pthread_mutex_lock(&c->current_job_lock);
for (i=0; i<thread_count; i++) {
if(pthread_create(&c->workers[i], NULL, worker, avctx)) { ///< worker线程创建
avctx->thread_count = i;
pthread_mutex_unlock(&c->current_job_lock);
ff_thread_free(avctx);
return -1;
}
}
thread_park_workers(c, thread_count); ///< 设置thread_count(成功创建的worker现程数)
avctx->execute = thread_execute;
avctx->execute2 = thread_execute2;
return 0;
}
openHEVC中,frame,slice thread 参数分开,更清晰!
8 submit_packet
主线程将packet submit给解码线程 frame worker thread。线程的异步操作,用到条件变量和互斥量。
/**
 * Hand a packet from the main thread to one frame worker thread.
 *
 * The hand-over is asynchronous and relies on the worker's mutexes and
 * condition variables: the packet is referenced into p->avpkt, p->state is
 * set to STATE_SETTING_UP and p->input_cond is signalled.
 *
 * @param p     per-thread context of the worker that should decode the packet
 * @param avpkt packet to decode; an empty packet is ignored unless the codec
 *              has AV_CODEC_CAP_DELAY
 * @return 0 on success or when the packet was ignored; otherwise the error
 *         returned by update_context_from_thread() or av_packet_ref()
 */
static int submit_packet(PerThreadContext *p, AVPacket *avpkt)
{
    FrameThreadContext *fctx = p->parent;
    PerThreadContext *prev_thread = fctx->prev_thread;
    const AVCodec *codec = p->avctx->codec;
    int ret;

    if (!avpkt->size && !(codec->capabilities & AV_CODEC_CAP_DELAY))
        return 0;

    pthread_mutex_lock(&p->mutex);

    release_delayed_buffers(p);

    if (prev_thread) {
        int err;

        /* Wait until the previously fed worker has left STATE_SETTING_UP
         * before copying its context. */
        if (prev_thread->state == STATE_SETTING_UP) {
            pthread_mutex_lock(&prev_thread->progress_mutex);
            while (prev_thread->state == STATE_SETTING_UP)
                pthread_cond_wait(&prev_thread->progress_cond, &prev_thread->progress_mutex);
            pthread_mutex_unlock(&prev_thread->progress_mutex);
        }

        /* Bring this worker's context up to date from the previous worker. */
        err = update_context_from_thread(p->avctx, prev_thread->avctx, 0);
        if (err) {
            pthread_mutex_unlock(&p->mutex);
            return err;
        }
    }

    av_packet_unref(&p->avpkt);
    /* av_packet_ref() can fail; do not wake the worker with an empty packet. */
    ret = av_packet_ref(&p->avpkt, avpkt);
    if (ret < 0) {
        pthread_mutex_unlock(&p->mutex);
        return ret;
    }

    p->state = STATE_SETTING_UP;
    /* Tell the frame decoding thread that its input packet is ready. */
    pthread_cond_signal(&p->input_cond);
    pthread_mutex_unlock(&p->mutex);

    /*
     * If the client doesn't have a thread-safe get_buffer(),
     * then decoding threads call back to the main thread,
     * and it calls back to the client here.
     */
    if (!p->avctx->thread_safe_callbacks && (
         p->avctx->get_format != avcodec_default_get_format ||
         p->avctx->get_buffer2 != avcodec_default_get_buffer2)) {
        while (p->state != STATE_SETUP_FINISHED && p->state != STATE_INPUT_READY) {
            int call_done = 1;

            pthread_mutex_lock(&p->progress_mutex);
            /* Wait for the decoding thread to signal progress_cond. */
            while (p->state == STATE_SETTING_UP)
                pthread_cond_wait(&p->progress_cond, &p->progress_mutex);

            /* Serve the worker's request through the generic helpers. */
            switch (p->state) {
            case STATE_GET_BUFFER:
                p->result = ff_get_buffer(p->avctx, p->requested_frame, p->requested_flags);
                break;
            case STATE_GET_FORMAT:
                p->result_format = ff_get_format(p->avctx, p->available_formats);
                break;
            default:
                call_done = 0;
                break;
            }

            /* A request was served: put the worker back into
             * STATE_SETTING_UP and wake it so it can pick up the result. */
            if (call_done) {
                p->state = STATE_SETTING_UP;
                pthread_cond_signal(&p->progress_cond);
            }
            pthread_mutex_unlock(&p->progress_mutex);
        }
    }

    fctx->prev_thread = p;
    fctx->next_decoding++;

    return 0;
}