Previously I wrote about using ffmpeg to record the microphone and the PC's internal audio (e.g. a song being played) and mixing the two. Now we go one step further: after mixing, the mixed audio is combined with the captured desktop video.
The general idea is as follows. Four threads are created:
HANDLE hThreadAudio = CreateThread(NULL, 0, AudioCapThreadProc, 0, 0, NULL);
HANDLE hThreadAudioMic = CreateThread(NULL, 0, AudioMicCapThreadProc, 0, 0, NULL);
HANDLE hThreadAudioMix = CreateThread(NULL, 0, AudioMixThreadProc, 0, 0, NULL);
HANDLE hThreadVideo = CreateThread(NULL, 0, ScreenCapThreadProc, 0, 0, NULL);
Thread hThreadAudio captures the PC's internal audio and thread hThreadAudioMic captures the local microphone. Thread hThreadAudioMix is dedicated to mixing those two audio streams, and thread hThreadVideo grabs the desktop video. The main thread then merges the mixed audio data with the video data.
Next, look at the following four queue variables:
AVFifoBuffer *fifo_video = NULL;
AVAudioFifo *fifo_audio = NULL;
AVAudioFifo *fifo_audio_mic = NULL;
///fifo_audio_mix is the queue that receives the mixed audio of the two streams, which also makes the subsequent audio/video synchronization easier
AVAudioFifo *fifo_audio_mix = NULL;
fifo_video is the queue for the video data grabbed by thread hThreadVideo; fifo_audio is the queue for the PC-internal audio captured by thread hThreadAudio; fifo_audio_mic is the queue for the microphone audio captured by thread hThreadAudioMic; and fifo_audio_mix is the queue for the data obtained by mixing the internal audio with the microphone audio.
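Each capture thread is a producer and the mixer (or the main thread) is the consumer, so every FIFO access is wrapped in its own critical section. Here is a minimal sketch of the producer side, with a hypothetical helper name (the real capture threads appear in the full listing below):
#include <Windows.h>
extern "C" {
#include "libavutil/audio_fifo.h"
}
//Append decoded samples to an audio FIFO if there is room, holding the
//FIFO's critical section only for the duration of the actual write.
static void push_samples(AVAudioFifo *fifo, CRITICAL_SECTION *cs, uint8_t **data, int nb_samples)
{
if (av_audio_fifo_space(fifo) >= nb_samples)
{
EnterCriticalSection(cs);
av_audio_fifo_write(fifo, (void **)data, nb_samples);
LeaveCriticalSection(cs);
}
}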
The main thread pulls video data from fifo_video and mixed audio from fifo_audio_mix, then writes both into the output file.
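The decision of which stream to write next rests on av_compare_ts, which compares the running video and audio timestamps in their respective stream time bases. A sketch of that test, with a hypothetical helper name (the loop in main() below does this inline with cur_pts_v and cur_pts_a):
extern "C" {
#include "libavformat/avformat.h"
#include "libavutil/mathematics.h"
}
//True when video lags audio (or is level), i.e. the next video frame should
//be encoded and muxed before any more audio.
static bool video_is_due(int64_t video_pts, const AVStream *vs, int64_t audio_pts, const AVStream *as)
{
return av_compare_ts(video_pts, vs->time_base, audio_pts, as->time_base) <= 0;
}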
As for audio formats: both hThreadAudio and hThreadAudioMic capture AV_SAMPLE_FMT_S16, so the mixing is done in AV_SAMPLE_FMT_S16 as well. When finally writing to the file, the samples have to be converted to AV_SAMPLE_FMT_FLTP.
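That final conversion is handled by libswresample. A minimal sketch of the context setup, assuming stereo at 44100 Hz on both sides (it mirrors the swr_alloc/swr_init calls in main() below):
extern "C" {
#include "libswresample/swresample.h"
#include "libavutil/opt.h"
#include "libavutil/channel_layout.h"
}
//Build a resampler that only changes the sample format: interleaved S16 in,
//planar float (FLTP) out, with channel layout and sample rate unchanged.
static SwrContext *make_s16_to_fltp(void)
{
SwrContext *ctx = swr_alloc();
av_opt_set_channel_layout(ctx, "in_channel_layout", AV_CH_LAYOUT_STEREO, 0);
av_opt_set_channel_layout(ctx, "out_channel_layout", AV_CH_LAYOUT_STEREO, 0);
av_opt_set_int(ctx, "in_sample_rate", 44100, 0);
av_opt_set_int(ctx, "out_sample_rate", 44100, 0);
av_opt_set_sample_fmt(ctx, "in_sample_fmt", AV_SAMPLE_FMT_S16, 0);
av_opt_set_sample_fmt(ctx, "out_sample_fmt", AV_SAMPLE_FMT_FLTP, 0);
if (swr_init(ctx) < 0)
swr_free(&ctx); //leaves ctx NULL on failure
return ctx;
}
swr_convert then takes an S16 frame drained from the mixing FIFO and fills a planar FLTP frame for the encoder, exactly as the main loop below does.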
The complete code follows:
// FfmpegAudioTest.cpp : This file contains the "main" function. Program execution begins and ends there.
//
#include <Windows.h>
#include <conio.h>
#ifdef __cplusplus
extern "C"
{
#endif
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
#include "libswresample/swresample.h"
#include "libavdevice/avdevice.h"
#include "libavutil/audio_fifo.h"
#include "libavutil/avutil.h"
#include "libavutil/fifo.h"
#include "libavutil/frame.h"
#include "libavutil/imgutils.h"
#include "libavfilter/avfilter.h"
#include "libavfilter/buffersink.h"
#include "libavfilter/buffersrc.h"
#include "SDL.h"
#pragma comment(lib, "avcodec.lib")
#pragma comment(lib, "avformat.lib")
#pragma comment(lib, "avutil.lib")
#pragma comment(lib, "avdevice.lib")
#pragma comment(lib, "avfilter.lib")
//#pragma comment(lib, "avfilter.lib")
//#pragma comment(lib, "postproc.lib")
#pragma comment(lib, "swresample.lib")
#pragma comment(lib, "swscale.lib")
#pragma comment(lib, "SDL2.lib")
#ifdef __cplusplus
};
#endif
AVFormatContext *pFormatCtx_Audio = NULL;
AVFormatContext *pFormatCtx_AudioMic = NULL;
AVFormatContext *pFormatCtx_Out = NULL;
AVFormatContext *pFormatCtx_Video = NULL;
AVCodecContext *pReadCodecCtx_Video = NULL;
AVCodecContext *pReadCodecCtx_Audio = NULL;
AVCodecContext *pReadCodecCtx_AudioMic = NULL;
AVCodec *pReadCodec_Video = NULL;
int iVideoStreamIndex = 0;
int iAudioStreamIndex = 1;
AVCodecContext *pCodecEncodeCtx_Video = NULL;
AVCodecContext *pCodecEncodeCtx_Audio = NULL;
AVCodec *pCodecEncode_Audio = NULL;
AVFifoBuffer *fifo_video = NULL;
AVAudioFifo *fifo_audio = NULL;
AVAudioFifo *fifo_audio_mic = NULL;
///fifo_audio_mix is the queue that receives the mixed audio of the two streams, which also makes the subsequent audio/video synchronization easier
AVAudioFifo *fifo_audio_mix = NULL;
SwsContext *img_convert_ctx = NULL;
SwrContext *audio_convert_ctx = NULL;
uint8_t *picture_buf = NULL, *frame_buf = NULL;
bool bCap = true;
int AudioFrameIndex_mix = 0;
int64_t cur_pts_v = 0;
int64_t cur_pts_a = 0;
int yuv420_frame_size = 0;
AVFilterGraph* _filter_graph = NULL;
AVFilterContext* _filter_ctx_src_inner = NULL;
AVFilterContext* _filter_ctx_src_mic = NULL;
AVFilterContext* _filter_ctx_sink = NULL;
CRITICAL_SECTION VideoSection;
CRITICAL_SECTION AudioSection;
CRITICAL_SECTION AudioSection_mic;
CRITICAL_SECTION AudioSection_mix;
DWORD WINAPI AudioCapThreadProc(LPVOID lpParam);
DWORD WINAPI AudioMicCapThreadProc(LPVOID lpParam);
DWORD WINAPI AudioMixThreadProc(LPVOID lpParam);
DWORD WINAPI ScreenCapThreadProc(LPVOID lpParam);
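// BufferSourceContext mirrors the private context behind FFmpeg's "abuffer"
// filter. It is copied here only so AudioMixThreadProc can peek at the
// negotiated input format for debugging; it is not public API and its layout
// may change between FFmpeg versions.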
typedef struct BufferSourceContext {
const AVClass *bscclass;
AVFifoBuffer *fifo;
AVRational time_base; ///< time_base to set in the output link
AVRational frame_rate; ///< frame_rate to set in the output link
unsigned nb_failed_requests;
unsigned warning_limit;
/* video only */
int w, h;
enum AVPixelFormat pix_fmt;
AVRational pixel_aspect;
char *sws_param;
AVBufferRef *hw_frames_ctx;
/* audio only */
int sample_rate;
enum AVSampleFormat sample_fmt;
int channels;
uint64_t channel_layout;
char *channel_layout_str;
int got_format_from_params;
int eof;
} BufferSourceContext;
static char *dup_wchar_to_utf8(const wchar_t *w)
{
char *s = NULL;
int l = WideCharToMultiByte(CP_UTF8, 0, w, -1, 0, 0, 0, 0);
s = (char *)av_malloc(l);
if (s)
WideCharToMultiByte(CP_UTF8, 0, w, -1, s, l, 0, 0);
return s;
}
/* pick the supported sample rate closest to 44100 Hz */
static int select_sample_rate(const AVCodec *codec)
{
const int *p;
int best_samplerate = 0;
if (!codec->supported_samplerates)
return 44100;
p = codec->supported_samplerates;
while (*p) {
if (!best_samplerate || abs(44100 - *p) < abs(44100 - best_samplerate))
best_samplerate = *p;
p++;
}
return best_samplerate;
}
/* select layout with the highest channel count */
static uint64_t select_channel_layout(const AVCodec *codec)
{
const uint64_t *p;
uint64_t best_ch_layout = 0;
int best_nb_channels = 0;
if (!codec->channel_layouts)
return AV_CH_LAYOUT_STEREO;
p = codec->channel_layouts;
while (*p) {
int nb_channels = av_get_channel_layout_nb_channels(*p);
if (nb_channels > best_nb_channels) {
best_ch_layout = *p;
best_nb_channels = nb_channels;
}
p++;
}
return best_ch_layout;
}
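// InitFilter builds the mixing graph described by filter_desc
// ("[in0][in1]amix=inputs=2[out]"): two abuffer sources (internal audio and
// microphone) feed an amix filter whose output is drained through an
// abuffersink.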
int InitFilter(const char* filter_desc)
{
char args_inner[512];
const char* pad_name_inner = "in0";
char args_mic[512];
const char* pad_name_mic = "in1";
AVFilter* filter_src_spk = (AVFilter *)avfilter_get_by_name("abuffer");
AVFilter* filter_src_mic = (AVFilter *)avfilter_get_by_name("abuffer");
AVFilter* filter_sink = (AVFilter *)avfilter_get_by_name("abuffersink");
AVFilterInOut* filter_output_inner = avfilter_inout_alloc();
AVFilterInOut* filter_output_mic = avfilter_inout_alloc();
AVFilterInOut* filter_input = avfilter_inout_alloc();
_filter_graph = avfilter_graph_alloc();
sprintf_s(args_inner, sizeof(args_inner), "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%I64x",
pReadCodecCtx_Audio->time_base.num,
pReadCodecCtx_Audio->time_base.den,
pReadCodecCtx_Audio->sample_rate,
av_get_sample_fmt_name((AVSampleFormat)pReadCodecCtx_Audio->sample_fmt),
pReadCodecCtx_Audio->channel_layout);
sprintf_s(args_mic, sizeof(args_mic), "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%I64x",
pReadCodecCtx_AudioMic->time_base.num,
pReadCodecCtx_AudioMic->time_base.den,
pReadCodecCtx_AudioMic->sample_rate,
av_get_sample_fmt_name((AVSampleFormat)pReadCodecCtx_AudioMic->sample_fmt),
pReadCodecCtx_AudioMic->channel_layout);
int ret = 0;
ret = avfilter_graph_create_filter(&_filter_ctx_src_inner, filter_src_spk, pad_name_inner, args_inner, NULL, _filter_graph);
if (ret < 0)
{
printf("Filter: failed to call avfilter_graph_create_filter -- src inner\n");
return -1;
}
ret = avfilter_graph_create_filter(&_filter_ctx_src_mic, filter_src_mic, pad_name_mic, args_mic, NULL, _filter_graph);
if (ret < 0)
{
printf("Filter: failed to call avfilter_graph_create_filter -- src mic\n");
return -1;
}
ret = avfilter_graph_create_filter(&_filter_ctx_sink, filter_sink, "out", NULL, NULL, _filter_graph);
if (ret < 0)
{
printf("Filter: failed to call avfilter_graph_create_filter -- sink\n");
return -1;
}
AVCodecContext* encodec_ctx = pCodecEncodeCtx_Audio;
//Mix in the capture format (S16) rather than the encoder's FLTP; the
//S16 -> FLTP conversion happens later, right before encoding.
ret = av_opt_set_bin(_filter_ctx_sink, "sample_fmts", (uint8_t*)&pReadCodecCtx_Audio->sample_fmt, sizeof(pReadCodecCtx_Audio->sample_fmt), AV_OPT_SEARCH_CHILDREN);
if (ret < 0)
{
printf("Filter: failed to call av_opt_set_bin -- sample_fmts\n");
return -1;
}
ret = av_opt_set_bin(_filter_ctx_sink, "channel_layouts", (uint8_t*)&encodec_ctx->channel_layout, sizeof(encodec_ctx->channel_layout), AV_OPT_SEARCH_CHILDREN);
if (ret < 0)
{
printf("Filter: failed to call av_opt_set_bin -- channel_layouts\n");
return -1;
}
ret = av_opt_set_bin(_filter_ctx_sink, "sample_rates", (uint8_t*)&encodec_ctx->sample_rate, sizeof(encodec_ctx->sample_rate), AV_OPT_SEARCH_CHILDREN);
if (ret < 0)
{
printf("Filter: failed to call av_opt_set_bin -- sample_rates\n");
return -1;
}
filter_output_inner->name = av_strdup(pad_name_inner);
filter_output_inner->filter_ctx = _filter_ctx_src_inner;
filter_output_inner->pad_idx = 0;
filter_output_inner->next = filter_output_mic;
filter_output_mic->name = av_strdup(pad_name_mic);
filter_output_mic->filter_ctx = _filter_ctx_src_mic;
filter_output_mic->pad_idx = 0;
filter_output_mic->next = NULL;
filter_input->name = av_strdup("out");
filter_input->filter_ctx = _filter_ctx_sink;
filter_input->pad_idx = 0;
filter_input->next = NULL;
AVFilterInOut* filter_outputs[2];
filter_outputs[0] = filter_output_inner;
filter_outputs[1] = filter_output_mic;
ret = avfilter_graph_parse_ptr(_filter_graph, filter_desc, &filter_input, filter_outputs, NULL);
if (ret < 0)
{
printf("Filter: failed to call avfilter_graph_parse_ptr\n");
return -1;
}
ret = avfilter_graph_config(_filter_graph, NULL);
if (ret < 0)
{
printf("Filter: failed to call avfilter_graph_config\n");
return -1;
}
avfilter_inout_free(&filter_input);
//Do not av_free() the AVFilter definitions returned by avfilter_get_by_name();
//they are static objects owned by libavfilter.
avfilter_inout_free(filter_outputs);
char* temp = avfilter_graph_dump(_filter_graph, NULL);
printf("%s\n", temp);
return 0;
}
int OpenVideoCapture()
{
const AVInputFormat *ifmt = av_find_input_format("gdigrab");
//Options can be set when opening, e.g. to specify the capture frame rate
AVDictionary *options = NULL;
av_dict_set(&options, "framerate", "25", NULL);
av_dict_set(&options, "probesize", "50000000", NULL);
//av_dict_set(&options,"offset_x","20",0);
//The distance from the top edge of the screen or desktop
//av_dict_set(&options,"offset_y","40",0);
//Video frame size. The default is to capture the full screen
//av_dict_set(&options,"video_size","320x240",0);
if (avformat_open_input(&pFormatCtx_Video, "desktop", ifmt, &options) != 0)
{
printf("Couldn't open input stream.(无法打开视频输入流)\n");
return -1;
}
if (avformat_find_stream_info(pFormatCtx_Video, NULL) < 0)
{
printf("Couldn't find stream information.(无法获取视频流信息)\n");
return -1;
}
if (pFormatCtx_Video->streams[0]->codecpar->codec_type != AVMEDIA_TYPE_VIDEO)
{
printf("Couldn't find video stream information.(无法获取视频流信息)\n");
return -1;
}
pReadCodec_Video = (AVCodec *)avcodec_find_decoder(pFormatCtx_Video->streams[0]->codecpar->codec_id);
if (pReadCodec_Video == NULL)
{
printf("Codec not found.\n");
return -1;
}
pReadCodecCtx_Video = avcodec_alloc_context3(pReadCodec_Video);
avcodec_parameters_to_context(pReadCodecCtx_Video, pFormatCtx_Video->streams[0]->codecpar);
if (avcodec_open2(pReadCodecCtx_Video, pReadCodec_Video, NULL) < 0)
{
printf("Could not open codec.\n");
return -1;
}
/* capture parameters: these drive the scaler, the FIFO sizing and the encoder below */
pReadCodecCtx_Video->bit_rate = 400000;
/* resolution must be a multiple of two */
pReadCodecCtx_Video->width = 1920;
pReadCodecCtx_Video->height = 1080;
/* frames per second */
AVRational timeBase;
timeBase.num = 1;
timeBase.den = 25;
pReadCodecCtx_Video->time_base = timeBase;
AVRational frameRate;
frameRate.den = 1;
frameRate.num = 25;
pReadCodecCtx_Video->framerate = frameRate;
/* emit one intra frame every ten frames
* check frame pict_type before passing frame
* to encoder, if frame->pict_type is AV_PICTURE_TYPE_I
* then gop_size is ignored and the output of encoder
* will always be I frame irrespective to gop_size
*/
pReadCodecCtx_Video->gop_size = 25;
pReadCodecCtx_Video->max_b_frames = 1;
pReadCodecCtx_Video->pix_fmt = AV_PIX_FMT_YUV420P;
img_convert_ctx = sws_getContext(pReadCodecCtx_Video->width, pReadCodecCtx_Video->height, (AVPixelFormat)pFormatCtx_Video->streams[0]->codecpar->format,
pReadCodecCtx_Video->width, pReadCodecCtx_Video->height, AV_PIX_FMT_YUV420P, SWS_BICUBIC, NULL, NULL, NULL);
yuv420_frame_size = av_image_get_buffer_size(pReadCodecCtx_Video->pix_fmt, pReadCodecCtx_Video->width, pReadCodecCtx_Video->height, 1);
//Reserve a FIFO big enough for 30 YUV420P frames
fifo_video = av_fifo_alloc(30 * yuv420_frame_size);
return 0;
}
int OpenAudioCapture()
{
//Locate the DirectShow input format
const AVInputFormat *pAudioInputFmt = av_find_input_format("dshow");
//Open the device via DirectShow, binding the input format to the format context
//const char * psDevName = dup_wchar_to_utf8(L"audio=麦克风 (2- Synaptics HD Audio)");
char * psDevName = dup_wchar_to_utf8(L"audio=virtual-audio-capturer");
if (avformat_open_input(&pFormatCtx_Audio, psDevName, pAudioInputFmt, NULL) < 0)
{
printf("Couldn't open input stream.(无法打开音频输入流)\n");
return -1;
}
if (pFormatCtx_Audio->streams[0]->codecpar->codec_type != AVMEDIA_TYPE_AUDIO)
{
printf("Couldn't find video stream information.(无法获取音频流信息)\n");
return -1;
}
const AVCodec *tmpCodec = avcodec_find_decoder(pFormatCtx_Audio->streams[0]->codecpar->codec_id);
pReadCodecCtx_Audio = avcodec_alloc_context3(tmpCodec);
pReadCodecCtx_Audio->sample_rate = select_sample_rate(tmpCodec);
pReadCodecCtx_Audio->channel_layout = select_channel_layout(tmpCodec);
pReadCodecCtx_Audio->channels = av_get_channel_layout_nb_channels(pReadCodecCtx_Audio->channel_layout);
pReadCodecCtx_Audio->sample_fmt = (AVSampleFormat)pFormatCtx_Audio->streams[0]->codecpar->format;
//pReadCodecCtx_Audio->sample_fmt = AV_SAMPLE_FMT_FLTP;
if (0 > avcodec_open2(pReadCodecCtx_Audio, tmpCodec, NULL))
{
printf("can not find or open audio decoder!\n");
return -1;
}
avcodec_parameters_from_context(pFormatCtx_Audio->streams[0]->codecpar, pReadCodecCtx_Audio);
return 0;
}
int OpenAudioMicCapture()
{
//Locate the DirectShow input format
const AVInputFormat *pAudioInputFmt = av_find_input_format("dshow");
//Open the device via DirectShow, binding the input format to the format context
const char * psDevName = dup_wchar_to_utf8(L"audio=麦克风 (2- Synaptics HD Audio)");
if (avformat_open_input(&pFormatCtx_AudioMic, psDevName, pAudioInputFmt, NULL) < 0)
{
printf("Couldn't open input stream.(无法打开音频输入流)\n");
return -1;
}
if (pFormatCtx_AudioMic->streams[0]->codecpar->codec_type != AVMEDIA_TYPE_AUDIO)
{
printf("Couldn't find video stream information.(无法获取音频流信息)\n");
return -1;
}
const AVCodec *tmpCodec = avcodec_find_decoder(pFormatCtx_AudioMic->streams[0]->codecpar->codec_id);
pReadCodecCtx_AudioMic = avcodec_alloc_context3(tmpCodec);
pReadCodecCtx_AudioMic->sample_rate = select_sample_rate(tmpCodec);
pReadCodecCtx_AudioMic->channel_layout = select_channel_layout(tmpCodec);
pReadCodecCtx_AudioMic->channels = av_get_channel_layout_nb_channels(pReadCodecCtx_AudioMic->channel_layout);
pReadCodecCtx_AudioMic->sample_fmt = (AVSampleFormat)pFormatCtx_AudioMic->streams[0]->codecpar->format;
//pReadCodecCtx_Audio->sample_fmt = AV_SAMPLE_FMT_FLTP;
if (0 > avcodec_open2(pReadCodecCtx_AudioMic, tmpCodec, NULL))
{
printf("can not find or open audio decoder!\n");
return -1;
}
avcodec_parameters_from_context(pFormatCtx_AudioMic->streams[0]->codecpar, pReadCodecCtx_AudioMic);
return 0;
}
int OpenOutPut()
{
AVStream *pAudioStream = NULL;
AVStream *pVideoStream = NULL;
const char *outFileName = "FfmpegAudioVideoAndMicTest.mp4";
avformat_alloc_output_context2(&pFormatCtx_Out, NULL, NULL, outFileName);
if (pFormatCtx_Video->streams[0]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
{
iVideoStreamIndex = 0;
pVideoStream = avformat_new_stream(pFormatCtx_Out, NULL);
if (!pVideoStream)
{
printf("can not new stream for output!\n");
return -1;
}
AVRational timeBase;
timeBase.num = 1;
timeBase.den = 25;
pVideoStream->time_base = timeBase;
AVCodec *pCodecEncode_Video = (AVCodec *)avcodec_find_encoder(pFormatCtx_Out->oformat->video_codec);
if (!(pCodecEncode_Video)) {
fprintf(stderr, "Could not find encoder for '%s'\n",
avcodec_get_name(AV_CODEC_ID_MPEG4));
exit(1);
}
pCodecEncodeCtx_Video = avcodec_alloc_context3(pCodecEncode_Video);
if (!pCodecEncodeCtx_Video) {
fprintf(stderr, "Could not alloc an encoding context\n");
exit(1);
}
pCodecEncodeCtx_Video->time_base = timeBase;
pCodecEncodeCtx_Video->codec_id = pFormatCtx_Out->oformat->video_codec;
pCodecEncodeCtx_Video->bit_rate = 400000;
/* Resolution must be a multiple of two. */
//pCodecEncodeCtx_Video->width = 352;
//pCodecEncodeCtx_Video->height = 288;
pCodecEncodeCtx_Video->width = 1920;
pCodecEncodeCtx_Video->height = 1080;
/* timebase: This is the fundamental unit of time (in seconds) in terms
* of which frame timestamps are represented. For fixed-fps content,
* timebase should be 1/framerate and timestamp increments should be
* identical to 1. */
pCodecEncodeCtx_Video->gop_size = 25; /* emit one intra frame every 25 frames at most */
pCodecEncodeCtx_Video->pix_fmt = AV_PIX_FMT_YUV420P;
if ((avcodec_open2(pCodecEncodeCtx_Video, pCodecEncode_Video, NULL)) < 0)
{
printf("can not open the encoder\n");
return -1;
}
}
if (pFormatCtx_Audio->streams[0]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
{
pAudioStream = avformat_new_stream(pFormatCtx_Out, NULL);
iAudioStreamIndex = 1;
pCodecEncode_Audio = (AVCodec *)avcodec_find_encoder(pFormatCtx_Out->oformat->audio_codec);
pCodecEncodeCtx_Audio = avcodec_alloc_context3(pCodecEncode_Audio);
if (!pCodecEncodeCtx_Audio) {
fprintf(stderr, "Could not alloc an encoding context\n");
exit(1);
}
//pCodecEncodeCtx_Audio->codec_id = pFormatCtx_Out->oformat->audio_codec;
pCodecEncodeCtx_Audio->sample_fmt = pCodecEncode_Audio->sample_fmts ? pCodecEncode_Audio->sample_fmts[0] : AV_SAMPLE_FMT_FLTP;
pCodecEncodeCtx_Audio->bit_rate = 64000;
pCodecEncodeCtx_Audio->sample_rate = 44100;
if (pCodecEncode_Audio->supported_samplerates) {
pCodecEncodeCtx_Audio->sample_rate = pCodecEncode_Audio->supported_samplerates[0];
for (int i = 0; pCodecEncode_Audio->supported_samplerates[i]; i++) {
if (pCodecEncode_Audio->supported_samplerates[i] == 44100)
pCodecEncodeCtx_Audio->sample_rate = 44100;
}
}
//channels is derived below, once the channel layout is final
pCodecEncodeCtx_Audio->channel_layout = AV_CH_LAYOUT_STEREO;
if (pCodecEncode_Audio->channel_layouts) {
pCodecEncodeCtx_Audio->channel_layout = pCodecEncode_Audio->channel_layouts[0];
for (int i = 0; pCodecEncode_Audio->channel_layouts[i]; i++) {
if (pCodecEncode_Audio->channel_layouts[i] == AV_CH_LAYOUT_STEREO)
pCodecEncodeCtx_Audio->channel_layout = AV_CH_LAYOUT_STEREO;
}
}
pCodecEncodeCtx_Audio->channels = av_get_channel_layout_nb_channels(pCodecEncodeCtx_Audio->channel_layout);
AVRational timeBase;
timeBase.den = pCodecEncodeCtx_Audio->sample_rate;
timeBase.num = 1;
pAudioStream->time_base = timeBase;
if (avcodec_open2(pCodecEncodeCtx_Audio, pCodecEncode_Audio, 0) < 0)
{
//failed to open the encoder, bail out
return -1;
}
}
if (!(pFormatCtx_Out->oformat->flags & AVFMT_NOFILE))
{
if (avio_open(&pFormatCtx_Out->pb, outFileName, AVIO_FLAG_WRITE) < 0)
{
printf("can not open output file handle!\n");
return -1;
}
}
avcodec_parameters_from_context(pVideoStream->codecpar, pCodecEncodeCtx_Video);
avcodec_parameters_from_context(pAudioStream->codecpar, pCodecEncodeCtx_Audio);
if (avformat_write_header(pFormatCtx_Out, NULL) < 0)
{
printf("can not write the header of the output file!\n");
return -1;
}
return 0;
}
int main(int argc, char* argv[])
{
int ret = 0;
AVSampleFormat sample_fmt = AV_SAMPLE_FMT_S16;
int iSize = av_get_bytes_per_sample(sample_fmt);
avdevice_register_all();
audio_convert_ctx = swr_alloc();
av_opt_set_channel_layout(audio_convert_ctx, "in_channel_layout", AV_CH_LAYOUT_STEREO, 0);
av_opt_set_channel_layout(audio_convert_ctx, "out_channel_layout", AV_CH_LAYOUT_STEREO, 0);
av_opt_set_int(audio_convert_ctx, "in_sample_rate", 44100, 0);
av_opt_set_int(audio_convert_ctx, "out_sample_rate", 44100, 0);
av_opt_set_sample_fmt(audio_convert_ctx, "in_sample_fmt", AV_SAMPLE_FMT_S16, 0);
//av_opt_set_sample_fmt(audio_convert_ctx, "in_sample_fmt", AV_SAMPLE_FMT_FLTP, 0);
av_opt_set_sample_fmt(audio_convert_ctx, "out_sample_fmt", AV_SAMPLE_FMT_FLTP, 0);
ret = swr_init(audio_convert_ctx);
if (OpenVideoCapture() < 0)
{
return -1;
}
if (OpenAudioCapture() < 0)
{
return -1;
}
if (OpenAudioMicCapture() < 0)
{
return -1;
}
if (OpenOutPut() < 0)
{
return -1;
}
const char* filter_desc = "[in0][in1]amix=inputs=2[out]";
ret = InitFilter(filter_desc);
if (ret < 0)
{
return -1;
}
InitializeCriticalSection(&VideoSection);
InitializeCriticalSection(&AudioSection);
InitializeCriticalSection(&AudioSection_mic);
InitializeCriticalSection(&AudioSection_mix);
AVFrame *pFrameYUVInMain = av_frame_alloc();
uint8_t *out_buffer_yuv420 = (uint8_t *)av_malloc(yuv420_frame_size);
av_image_fill_arrays(pFrameYUVInMain->data, pFrameYUVInMain->linesize, out_buffer_yuv420, AV_PIX_FMT_YUV420P, pReadCodecCtx_Video->width, pReadCodecCtx_Video->height, 1);
int AudioFrameIndex_mic = 1;
AVPacket packet = { 0 };
int iPicCount = 1;
HANDLE hThreadAudio = CreateThread(NULL, 0, AudioCapThreadProc, 0, 0, NULL);
HANDLE hThreadAudioMic = CreateThread(NULL, 0, AudioMicCapThreadProc, 0, 0, NULL);
HANDLE hThreadAudioMix = CreateThread(NULL, 0, AudioMixThreadProc, 0, 0, NULL);
HANDLE hThreadVideo = CreateThread(NULL, 0, ScreenCapThreadProc, 0, 0, NULL);
while (bCap)
{
if (NULL == fifo_video)
{
continue;
}
if (NULL == fifo_audio_mix)
{
continue;
}
if (av_compare_ts(cur_pts_v, pFormatCtx_Out->streams[iVideoStreamIndex]->time_base,
cur_pts_a, pFormatCtx_Out->streams[iAudioStreamIndex]->time_base) <= 0)
{
if (av_fifo_size(fifo_video) >= yuv420_frame_size)
{
EnterCriticalSection(&VideoSection);
av_fifo_generic_read(fifo_video, out_buffer_yuv420, yuv420_frame_size, NULL);
LeaveCriticalSection(&VideoSection);
packet.pts = iPicCount;
packet.dts = iPicCount;
av_packet_rescale_ts(&packet, pReadCodecCtx_Video->time_base, pFormatCtx_Out->streams[0]->time_base);
pFrameYUVInMain->width = pReadCodecCtx_Video->width;
pFrameYUVInMain->height = pReadCodecCtx_Video->height;
pFrameYUVInMain->format = AV_PIX_FMT_YUV420P;
pFrameYUVInMain->pts = packet.pts;
pFrameYUVInMain->pkt_dts = packet.pts;
cur_pts_v = packet.pts;
av_packet_unref(&packet);
ret = avcodec_send_frame(pCodecEncodeCtx_Video, pFrameYUVInMain);
ret = avcodec_receive_packet(pCodecEncodeCtx_Video, &packet);
ret = av_interleaved_write_frame(pFormatCtx_Out, &packet);
avio_flush(pFormatCtx_Out->pb);
iPicCount++;
if (iPicCount > 1000)
{
bCap = false;
}
}
}
else
{
if (av_audio_fifo_size(fifo_audio_mix) >=
(pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size > 0 ? pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size : 1024))
{
AVFrame *frame_mix = NULL;
frame_mix = av_frame_alloc();
frame_mix->nb_samples = pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size > 0 ? pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size : 1024;
frame_mix->channel_layout = pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->channel_layout;
frame_mix->format = AV_SAMPLE_FMT_S16; //the mixed samples in fifo_audio_mix are S16
frame_mix->sample_rate = pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->sample_rate;
av_frame_get_buffer(frame_mix, 0);
EnterCriticalSection(&AudioSection_mix);
int readcount = av_audio_fifo_read(fifo_audio_mix, (void **)frame_mix->data,
(pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size > 0 ? pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size : 1024));
LeaveCriticalSection(&AudioSection_mix);
AVPacket pkt_out_mic = { 0 };
pkt_out_mic.data = NULL;
pkt_out_mic.size = 0;
frame_mix->pts = AudioFrameIndex_mic * pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size;
AVFrame *frame_mic_encode = NULL;
frame_mic_encode = av_frame_alloc();
frame_mic_encode->nb_samples = pCodecEncodeCtx_Audio->frame_size;
frame_mic_encode->channel_layout = pCodecEncodeCtx_Audio->channel_layout;
frame_mic_encode->format = pCodecEncodeCtx_Audio->sample_fmt;
frame_mic_encode->sample_rate = pCodecEncodeCtx_Audio->sample_rate;
av_frame_get_buffer(frame_mic_encode, 0);
int dst_nb_samples = av_rescale_rnd(swr_get_delay(audio_convert_ctx, frame_mix->sample_rate) + frame_mix->nb_samples, frame_mix->sample_rate, frame_mix->sample_rate, AVRounding(1));
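// Input and output sample rates are equal, so this is just frame_mix->nb_samples
// plus whatever the resampler still has buffered (reported by swr_get_delay).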
//uint8_t *audio_buf = NULL;
uint8_t *audio_buf[2] = { 0 };
audio_buf[0] = (uint8_t *)frame_mic_encode->data[0];
audio_buf[1] = (uint8_t *)frame_mic_encode->data[1];
int nb = swr_convert(audio_convert_ctx, audio_buf, dst_nb_samples, (const uint8_t**)frame_mix->data, frame_mix->nb_samples);
ret = avcodec_send_frame(pCodecEncodeCtx_Audio, frame_mic_encode);
ret = avcodec_receive_packet(pCodecEncodeCtx_Audio, &pkt_out_mic);
//free the frames before any early continue, otherwise they leak
av_frame_free(&frame_mix);
av_frame_free(&frame_mic_encode);
if (ret == AVERROR(EAGAIN))
{
continue;
}
{
pkt_out_mic.stream_index = iAudioStreamIndex;
pkt_out_mic.pts = AudioFrameIndex_mic * pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size;
pkt_out_mic.dts = AudioFrameIndex_mic * pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size;
pkt_out_mic.duration = pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size;
cur_pts_a = pkt_out_mic.pts;
int ret2 = av_interleaved_write_frame(pFormatCtx_Out, &pkt_out_mic);
av_packet_unref(&pkt_out_mic);
}
AudioFrameIndex_mic++;
}
}
}
Sleep(100);
av_write_trailer(pFormatCtx_Out);
avio_close(pFormatCtx_Out->pb);
avformat_free_context(pFormatCtx_Out);
WaitForSingleObject(hThreadAudio, 3000);
WaitForSingleObject(hThreadAudioMic, 3000);
WaitForSingleObject(hThreadAudioMix, 3000);
WaitForSingleObject(hThreadVideo, 3000);
if (pFormatCtx_Audio != NULL)
{
avformat_close_input(&pFormatCtx_Audio);
}
if (pFormatCtx_AudioMic != NULL)
{
avformat_close_input(&pFormatCtx_AudioMic);
}
return 0;
}
DWORD WINAPI AudioCapThreadProc(LPVOID lpParam)
{
AVFrame *pFrame;
pFrame = av_frame_alloc();
AVPacket packet = { 0 };
int ret = 0;
while (bCap)
{
av_packet_unref(&packet);
if (av_read_frame(pFormatCtx_Audio, &packet) < 0)
{
continue;
}
ret = avcodec_send_packet(pReadCodecCtx_Audio, &packet);
if (ret >= 0)
{
ret = avcodec_receive_frame(pReadCodecCtx_Audio, pFrame);
if (ret == AVERROR(EAGAIN))
{
continue; //the decoder needs more input; keep capturing
}
else if (ret == AVERROR_EOF)
{
return 0;
}
else if (ret < 0) {
fprintf(stderr, "Error during decoding\n");
exit(1);
}
if (NULL == fifo_audio)
{
fifo_audio = av_audio_fifo_alloc((AVSampleFormat)pFormatCtx_Audio->streams[0]->codecpar->format,
pFormatCtx_Audio->streams[0]->codecpar->channels, 30 * pFrame->nb_samples);
}
if (av_audio_fifo_space(fifo_audio) >= pFrame->nb_samples)
{
//AudioSection
EnterCriticalSection(&AudioSection);
ret = av_audio_fifo_write(fifo_audio, (void **)pFrame->data, pFrame->nb_samples);
LeaveCriticalSection(&AudioSection);
}
av_packet_unref(&packet);
}
}
return 0;
}
DWORD WINAPI AudioMicCapThreadProc(LPVOID lpParam)
{
AVFrame *pFrame;
pFrame = av_frame_alloc();
AVPacket packet = { 0 };
int ret = 0;
while (bCap)
{
av_packet_unref(&packet);
if (av_read_frame(pFormatCtx_AudioMic, &packet) < 0)
{
continue;
}
ret = avcodec_send_packet(pReadCodecCtx_AudioMic, &packet);
if (ret >= 0)
{
ret = avcodec_receive_frame(pReadCodecCtx_AudioMic, pFrame);
if (ret == AVERROR(EAGAIN))
{
continue; //the decoder needs more input; keep capturing
}
else if (ret == AVERROR_EOF)
{
return 0;
}
else if (ret < 0) {
fprintf(stderr, "Error during decoding\n");
exit(1);
}
if (NULL == fifo_audio_mic)
{
fifo_audio_mic = av_audio_fifo_alloc((AVSampleFormat)pFormatCtx_AudioMic->streams[0]->codecpar->format,
pFormatCtx_AudioMic->streams[0]->codecpar->channels, 30 * pFrame->nb_samples);
}
if (av_audio_fifo_space(fifo_audio_mic) >= pFrame->nb_samples)
{
EnterCriticalSection(&AudioSection_mic);
ret = av_audio_fifo_write(fifo_audio_mic, (void **)pFrame->data, pFrame->nb_samples);
LeaveCriticalSection(&AudioSection_mic);
}
av_packet_unref(&packet);
}
}
return 0;
}
DWORD WINAPI AudioMixThreadProc(LPVOID lpParam)
{
int ret = 0;
AVFrame *frame_audio = NULL;
frame_audio = av_frame_alloc();
AVFrame *frame_audio_mic = NULL;
frame_audio_mic = av_frame_alloc();
if (NULL == fifo_audio_mix)
{
fifo_audio_mix = av_audio_fifo_alloc((AVSampleFormat)pFormatCtx_Audio->streams[0]->codecpar->format,
pFormatCtx_Audio->streams[0]->codecpar->channels, 30 * 1024);
}
while (bCap)
{
if (NULL == fifo_audio)
{
continue;
}
if (NULL == fifo_audio_mic)
{
continue;
}
int fifo_inner_size = av_audio_fifo_size(fifo_audio);
int fifo_mic_size = av_audio_fifo_size(fifo_audio_mic);
//int frame_inner_min_size = pReadCodecCtx_Audio->frame_size;
//int frame_mic_min_size = pReadCodecCtx_AudioMic->frame_size;
int frame_inner_min_size = 1024;
int frame_mic_min_size = 1024;
if (fifo_inner_size >= frame_inner_min_size && fifo_mic_size >= frame_mic_min_size)
{
frame_audio->nb_samples = frame_inner_min_size;
frame_audio->channel_layout = AV_CH_LAYOUT_STEREO;
frame_audio->format = pFormatCtx_Audio->streams[0]->codecpar->format;
frame_audio->sample_rate = pFormatCtx_Audio->streams[0]->codecpar->sample_rate;
av_frame_get_buffer(frame_audio, 0);
frame_audio_mic->nb_samples = frame_mic_min_size;
frame_audio_mic->channel_layout = AV_CH_LAYOUT_STEREO;
frame_audio_mic->format = pFormatCtx_AudioMic->streams[0]->codecpar->format;
frame_audio_mic->sample_rate = pFormatCtx_AudioMic->streams[0]->codecpar->sample_rate;
av_frame_get_buffer(frame_audio_mic, 0);
EnterCriticalSection(&AudioSection);
int readcount = av_audio_fifo_read(fifo_audio, (void **)frame_audio->data, frame_inner_min_size);
LeaveCriticalSection(&AudioSection);
EnterCriticalSection(&AudioSection_mic);
readcount = av_audio_fifo_read(fifo_audio_mic, (void**)frame_audio_mic->data, frame_mic_min_size);
LeaveCriticalSection(&AudioSection_mic);
frame_audio->pts = AudioFrameIndex_mix * pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size;
frame_audio_mic->pts = AudioFrameIndex_mix * pFormatCtx_Out->streams[iAudioStreamIndex]->codecpar->frame_size;
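// Debug aid: compare the abuffer source's negotiated parameters (via the
// copied BufferSourceContext layout above) with the frame we are about to feed.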
BufferSourceContext* s = (BufferSourceContext*)_filter_ctx_src_inner->priv;
bool b1 = (s->sample_fmt != frame_audio->format);
bool b2 = (s->sample_rate != frame_audio->sample_rate);
bool b3 = (s->channel_layout != frame_audio->channel_layout);
bool b4 = (s->channels != frame_audio->channels);
ret = av_buffersrc_add_frame(_filter_ctx_src_inner, frame_audio);
if (ret < 0)
{
printf("Mixer: failed to call av_buffersrc_add_frame (speaker)\n");
break;
}
ret = av_buffersrc_add_frame(_filter_ctx_src_mic, frame_audio_mic);
if (ret < 0)
{
printf("Mixer: failed to call av_buffersrc_add_frame (microphone)\n");
break;
}
while (1)
{
AVFrame* pFrame_out = av_frame_alloc();
ret = av_buffersink_get_frame_flags(_filter_ctx_sink, pFrame_out, 0);
if (ret < 0)
{
av_frame_free(&pFrame_out);
//printf("Mixer: failed to call av_buffersink_get_frame_flags\n");
break;
}
AudioFrameIndex_mix++;
EnterCriticalSection(&AudioSection_mix);
ret = av_audio_fifo_write(fifo_audio_mix, (void **)pFrame_out->data, pFrame_out->nb_samples);
LeaveCriticalSection(&AudioSection_mix);
av_frame_free(&pFrame_out);
}
}
}
av_frame_free(&frame_audio);
av_frame_free(&frame_audio_mic);
return 0;
}
DWORD WINAPI ScreenCapThreadProc(LPVOID lpParam)
{
AVFrame *pFrame;
pFrame = av_frame_alloc();
AVFrame *pFrameYUV = av_frame_alloc();
int frame_size = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, pReadCodecCtx_Video->width, pReadCodecCtx_Video->height, 1);
uint8_t *out_buffer_yuv420 = (uint8_t *)av_malloc(frame_size);
av_image_fill_arrays(pFrameYUV->data, pFrameYUV->linesize, out_buffer_yuv420, AV_PIX_FMT_YUV420P, pReadCodecCtx_Video->width, pReadCodecCtx_Video->height, 1);
int y_size = pReadCodecCtx_Video->width * pReadCodecCtx_Video->height;
AVPacket packet = { 0 };
int ret = 0;
while (bCap)
{
av_packet_unref(&packet);
if (av_read_frame(pFormatCtx_Video, &packet) < 0)
{
continue;
}
ret = avcodec_send_packet(pReadCodecCtx_Video, &packet);
if (ret >= 0)
{
ret = avcodec_receive_frame(pReadCodecCtx_Video, pFrame);
if (ret == AVERROR(EAGAIN))
{
continue;
}
else if (ret == AVERROR_EOF)
{
break;
}
else if (ret < 0) {
fprintf(stderr, "Error during decoding\n");
break;
}
int iScale = sws_scale(img_convert_ctx, (const uint8_t* const*)pFrame->data, pFrame->linesize, 0, pCodecEncodeCtx_Video->height, pFrameYUV->data, pFrameYUV->linesize);
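// pFrameYUV was filled with alignment 1, so the three planes are tightly
// packed and can be written into the byte FIFO back to back: Y takes y_size
// bytes, U and V a quarter of that each for YUV420P.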
if (av_fifo_space(fifo_video) >= frame_size)
{
EnterCriticalSection(&VideoSection);
av_fifo_generic_write(fifo_video, pFrameYUV->data[0], y_size, NULL);
av_fifo_generic_write(fifo_video, pFrameYUV->data[1], y_size / 4, NULL);
av_fifo_generic_write(fifo_video, pFrameYUV->data[2], y_size / 4, NULL);
LeaveCriticalSection(&VideoSection);
}
}
if (ret == AVERROR(EAGAIN))
{
continue;
}
}
av_frame_free(&pFrame);
av_frame_free(&pFrameYUV);
return 0;
}