live555支持MPEG4的ES(Elementary Stream)流,相关类为MPEG4VideoStreamFramer、MPEG4ESVideoRTPSink。我想扩展其对avi格式的支持,将avi中的MPEG4数据包解析出来后,交给MPEG4VideoStreamFramer进行处理。后来发现,这样根本不行。问题在于,MPEG4VideoStreamFramer处理的是严格的MPEG4 ES流。
先简单的说明一下MPEG4的ES流:
MPEG4 Elementary Stream 组成如下:
VOS->VO->VOL->GOV(可选)->VOP
VOS 视觉对象序列
VO 视觉对象
VOL 视觉对象层
GOV 视觉对象平面组(VOP组)
VOP 视觉对象平面
紧跟着VOP开始符的,有一个2 bit的标志,用来表示这个Frame到底是I Frame、P Frame、B Frame抑或是S Frame(GMC-VOP)。
标志如下:
00: I Frame
01: P Frame
10: B Frame
11: S Frame
起始符及结束符定义如下:
// MPEG-4 start/end codes. Each one is the 3-byte prefix 00 00 01 followed by
// a single identifying byte (the low byte of the values below).
// Marks the beginning of a visual object sequence (VOS).
#define VISUAL_OBJECT_SEQUENCE_START_CODE 0x000001B0
// Marks the end of a visual object sequence (VOS).
#define VISUAL_OBJECT_SEQUENCE_END_CODE 0x000001B1
// Marks the beginning of a group of VOPs (GOV).
#define GROUP_VOP_START_CODE 0x000001B3
// Marks the beginning of a visual object (VO).
#define VISUAL_OBJECT_START_CODE 0x000001B5
// Marks the beginning of a video object plane (VOP).
#define VOP_START_CODE 0x000001B6
用二进制方式打开avi文件,发现只存在vop开始符,说明只存在VOP层次,而不是严格的ES流。可以认为一个VOP对应着一个帧。
后来发现,live555中实现了另一个类,MPEG4VideoStreamDiscreteFramer,继承自MPEG4VideoStreamFramer。它可以处理VOS,也可以处理一个个的GOV及VOP,正好可以满足需求。
看一下MPEG4VideoStreamDiscreteFramer对MPEG4数据的处理
// Post-processes one discrete MPEG-4 frame that has already been read into
// "fTo", then completes delivery to the downstream reader.
//
// frameSize              - number of bytes of frame data available in "fTo"
// numTruncatedBytes      - passed through unchanged to "fNumTruncatedBytes"
// presentationTime       - presentation time supplied for this frame; it is
//                          replaced by a recomputed value for "B" frames
// durationInMicroseconds - passed through unchanged to "fDurationInMicroseconds"
//
// If the frame begins with a start code (00 00 01 xx):
//  - for a visual object sequence (0xB0), the bytes preceding the first
//    GOV/VOP start code are saved as configuration data and the VOL header
//    is analyzed;
//  - for a VOP (0xB6, optionally preceded by a GOV 0xB3), the coding type and
//    "vop_time_increment" are extracted, and the presentation time of a "B"
//    frame is derived from that of the last non-"B" frame.
void MPEG4VideoStreamDiscreteFramer
::afterGettingFrame1(unsigned frameSize, unsigned numTruncatedBytes,
                     struct timeval presentationTime,
                     unsigned durationInMicroseconds) {
  // Check that the first 4 bytes are a system code:
  if (frameSize >= 4 && fTo[0] == 0 && fTo[1] == 0 && fTo[2] == 1) {
    fPictureEndMarker = True; // Assume that we have a complete 'picture' here
    unsigned i = 3;

    // Visual object sequence: the frame starts like a complete MPEG-4
    // Elementary Stream, so parse it as such.
    if (fTo[i] == 0xB0) { // VISUAL_OBJECT_SEQUENCE_START_CODE
      // The next byte is the "profile_and_level_indication":
      if (frameSize >= 5) fProfileAndLevelIndication = fTo[4];

      // The start of this frame - up to the first GROUP_VOP_START_CODE
      // or VOP_START_CODE - is stream configuration information.  Save this:
      for (i = 7; i < frameSize; ++i) {
        if ((fTo[i] == 0xB3 /*GROUP_VOP_START_CODE*/ ||
             fTo[i] == 0xB6 /*VOP_START_CODE*/)
            && fTo[i-1] == 1 && fTo[i-2] == 0 && fTo[i-3] == 0) {
          break; // The configuration information ends here
        }
      }
      // "i" indexes the last byte of the start code that was found, so the
      // configuration ends 3 bytes earlier; if none was found, it is the
      // whole frame.
      fNumConfigBytes = i < frameSize ? i-3 : frameSize;
      delete[] fConfigBytes; fConfigBytes = new unsigned char[fNumConfigBytes];
      for (unsigned j = 0; j < fNumConfigBytes; ++j) fConfigBytes[j] = fTo[j];

      // This information (should) also contain a VOL header, which we need
      // to analyze, to get "vop_time_increment_resolution" (which we need
      // - along with "vop_time_increment" - in order to generate accurate
      // presentation times for "B" frames).
      analyzeVOLHeader();
    }

    if (i < frameSize) {
      u_int8_t nextCode = fTo[i];

      // Group of VOPs:
      if (nextCode == 0xB3 /*GROUP_VOP_START_CODE*/) {
        // Skip to the following VOP_START_CODE (if any):
        for (i += 4; i < frameSize; ++i) {
          if (fTo[i] == 0xB6 /*VOP_START_CODE*/
              && fTo[i-1] == 1 && fTo[i-2] == 0 && fTo[i-3] == 0) {
            nextCode = fTo[i];
            break;
          }
        }
      }

      // Video object plane (needs 5 more bytes after the start code):
      if (nextCode == 0xB6 /*VOP_START_CODE*/ && i+5 < frameSize) {
        ++i;

        // Get the "vop_coding_type" from the next byte; it is the top 2 bits
        // (0: I, 1: P, 2: B, 3: S):
        u_int8_t nextByte = fTo[i++];
        u_int8_t vop_coding_type = nextByte>>6;

        // Next, get the "modulo_time_base" by counting the '1' bits that
        // follow.  We look at the next 32-bits only.
        // This should be enough in most cases.
        // (Each byte is widened to 32 bits before shifting, so that the shift
        // is done on an unsigned value rather than on a promoted signed int.)
        u_int32_t next4Bytes
          = ((u_int32_t)fTo[i]<<24)|((u_int32_t)fTo[i+1]<<16)
          |((u_int32_t)fTo[i+2]<<8)|(u_int32_t)fTo[i+3];
        i += 4;
        // Drop the 2 coding-type bits and keep the 32 bits that follow them:
        u_int32_t timeInfo = ((u_int32_t)nextByte<<(32-6))|(next4Bytes>>6);
        unsigned modulo_time_base = 0;
        u_int32_t mask = 0x80000000;
        while ((timeInfo&mask) != 0) {
          ++modulo_time_base;
          mask >>= 1;
        }
        // Step over the terminating '0' bit and the bit that follows it:
        mask >>= 2;

        // Then, get the "vop_time_increment".
        unsigned vop_time_increment = 0;
        // First, make sure we have enough bits left for this.  (A zero
        // "fNumVTIRBits" would otherwise produce an out-of-range shift count.)
        if (fNumVTIRBits > 0 && (mask>>(fNumVTIRBits-1)) != 0) {
          for (unsigned bit = 0; bit < fNumVTIRBits; ++bit) {
            vop_time_increment |= timeInfo&mask;
            mask >>= 1;
          }
          // Right-align the extracted field:
          while (mask != 0) {
            vop_time_increment >>= 1;
            mask >>= 1;
          }
        }

        // If this is a "B" frame, then we have to tweak "presentationTime":
        if (vop_coding_type == 2/*B*/
            && (fLastNonBFramePresentationTime.tv_usec > 0 ||
                fLastNonBFramePresentationTime.tv_sec > 0)) {
          int timeIncrement
            = fLastNonBFrameVop_time_increment - vop_time_increment;
          if (timeIncrement<0) timeIncrement += vop_time_increment_resolution;
          unsigned const MILLION = 1000000;
          double usIncrement = vop_time_increment_resolution == 0 ? 0.0
            : ((double)timeIncrement*MILLION)/vop_time_increment_resolution;
          unsigned secondsToSubtract = (unsigned)(usIncrement/MILLION);
          unsigned uSecondsToSubtract = ((unsigned)usIncrement)%MILLION;

          // Subtract the increment from the last non-"B" frame's time,
          // borrowing a second if needed and clamping at zero:
          presentationTime = fLastNonBFramePresentationTime;
          if ((unsigned)presentationTime.tv_usec < uSecondsToSubtract) {
            presentationTime.tv_usec += MILLION;
            if (presentationTime.tv_sec > 0) --presentationTime.tv_sec;
          }
          presentationTime.tv_usec -= uSecondsToSubtract;
          if ((unsigned)presentationTime.tv_sec > secondsToSubtract) {
            presentationTime.tv_sec -= secondsToSubtract;
          } else {
            presentationTime.tv_sec = presentationTime.tv_usec = 0;
          }
        } else {
          // Remember this frame as the reference for later "B" frames:
          fLastNonBFramePresentationTime = presentationTime;
          fLastNonBFrameVop_time_increment = vop_time_increment;
        }
      }
    }
  }

  // Complete delivery to the client:
  fFrameSize = frameSize;
  fNumTruncatedBytes = numTruncatedBytes;
  fPresentationTime = presentationTime;
  fDurationInMicroseconds = durationInMicroseconds;
  afterGetting(this);
}
上面的代码,除了在遇到VOS时保存配置信息并分析VOL头之外,其实主要只完成一个功能:当当前VOP为B帧时,调整时间戳。
最后关注一下MPEG4 ES流时间戳的处理。在处理MPEG4的ES流时,使用MPEG4VideoStreamFramer作为source,并使用分析器MPEG4VideoStreamParser对完整的MPEG4 Elementary Stream进行分析,主要是解析出其中的时间信息。
// Asks the parser for the next frame and, if one was acquired, fills in the
// frame size, truncated-byte count and duration before delivering it.
void MPEGVideoStreamFramer::continueReadProcessing() {
  unsigned const parsedSize = fParser->parse();
  if (parsedSize == 0) {
    // No complete frame could be parsed from the input, because either:
    // - more data has to be read from the source stream, or
    // - the source stream has ended.
    return;
  }

  // A frame was acquired from the input; it has already been copied to the
  // reader's space.
  fFrameSize = parsedSize;
  fNumTruncatedBytes = fParser->numTruncatedBytes();

  // "fPresentationTime" should have already been computed.

  // Derive the frame's duration from the picture count and the frame rate
  // (zero if the frame rate is unknown or the count looks negative):
  if (fFrameRate == 0.0 || ((int)fPictureCount) < 0) {
    fDurationInMicroseconds = 0;
  } else {
    fDurationInMicroseconds = (unsigned)((fPictureCount*1000000)/fFrameRate);
  }
  fPictureCount = 0;

  // Call our own 'after getting' function.  Because we're not a 'leaf'
  // source, we can call this directly, without risking infinite recursion.
  afterGetting(this);
}
计算fDurationInMicroseconds需要frame rate参数fFrameRate, 它是通过分析VOL头确定的
void MPEG4VideoStreamParser::analyzeVOLHeader() {
// Abridged excerpt: the "..." lines below stand for code omitted here.
// Parse the timing information out of the VOL header.
//
// Extract timing information (in particular,
// "vop_time_increment_resolution") from the VOL Header:
...
do {
...
// Use "vop_time_increment_resolution" as the 'frame rate'
// (really, 'tick rate'):
usingSource()->fFrameRate = (double)vop_time_increment_resolution; // stored as the source's frame rate
return;
} while (0);
...
}