在开始编写读取文件时,先简单介绍下解析策略,对于xml文件:
<?This is a Declaration!?>
<!--This is a Document Comment!-->
<School>
<!--This is a School Comment!-->
<teachers>
<Wang/>
<Li/>
<Zhao/>
</teachers>
<students>
<LiMing sex="male" height="174" weight="80.400000000000006" Is_good_at_math="false">Li Ming is a good Student!</LiMing>
<CuiHua>this is a Text!</CuiHua>
<Hanmeimei><![CDATA[this is a CData Text:if (a < b && a < 0)]]></Hanmeimei>
</students>
<!this is a Unknow!>
</School>
我们要建立的DOM树如下图所示,
不同节点的头是不同的,如下:
XMLDeclaration:<?
XMLComment: <!--
XMLText(CData):<![CDATA[
XMLUnknown:<!
XMLElement:<
XMLText:剩下的
在解析<xxx> ...</xxx>这种类型的Element时,第一个<xxx>为父节点;结束标记</xxx>也会被当作一个XMLElement解析出来(状态为CLOSING),它只用来标记父节点解析的结束,解析完后即被删除,不会留在DOM树中。
对于父节点,我们使用如下策略:1)我们从头开始遍历文件,如果碰到 <?,创建XMLDeclaration,继续搜索?>,使用InsertEndChild插入;
2)如果碰到 <!-- ,创建XMLComment,继续搜索-->,使用InsertEndChild插入;
3)如果碰到<![CDATA[,创建XMLText,继续搜索]]>,使用InsertEndChild插入;
4)如果碰到 <!,创建XMLUnknown,继续搜索>,使用InsertEndChild插入;
5)如果碰到 <,创建XMLElement,接着解析:
if(/xxx>类型){
则标记状态为CLOSING,意味着某个Element结束了。该节点不会被插入DOM树:它的名称被传回给父XMLElement之后,节点本身立即被删除;随后在上一层把这个名称和父XMLElement的名称做对比,不相等则报XML_ERROR_MISMATCHED_ELEMENT错误。
}
else{
使用InsertEndChild插入;解析Attributes,如果以/>结尾,该节点解析完毕。否则,将当前节点设为父节点,跳到1)开始解析它的子节点。
}
6)XMLText,搜索<,返回<之前的指针即可。
从上面的逻辑可以看出,需要递归解析,而每个节点类型的解析方式也不同,所以需要一个多态的解析接口。为此在XMLNode中增加虚函数ParseDeep(源代码的子类重写该函数时并没有加virtual,不是个好习惯)。XMLNode自身的ParseDeep实现的就是上面父节点的搜索策略:
// Polymorphic parsing hook: each node type provides its own implementation.
// Takes the current parse position and a StrPair that the XMLNode
// implementation uses to report the name of a closing tag to its caller;
// returns the position after the parsed content, or 0.
virtual char* ParseDeep( char*, StrPair* );
先定义搜索节点头函数:
char* XMLDocument::Identify( char* p, XMLNode** node )
{
TIXMLASSERT( node );
TIXMLASSERT( p );
char* const start = p;
p = XMLUtil::SkipWhiteSpace( p );
if( !*p ) {
*node = 0;
TIXMLASSERT( p );
return p;
}
// These strings define the matching patterns:
static const char* xmlHeader = { "<?" };
static const char* commentHeader = { "<!--" };
static const char* cdataHeader = { "<![CDATA[" };
static const char* dtdHeader = { "<!" };
static const char* elementHeader = { "<" }; // and a header for everything else; check last.
static const int xmlHeaderLen = 2;
static const int commentHeaderLen = 4;
static const int cdataHeaderLen = 9;
static const int dtdHeaderLen = 2;
static const int elementHeaderLen = 1;
TIXMLASSERT( sizeof( XMLComment ) == sizeof( XMLUnknown ) ); // use same memory pool
TIXMLASSERT( sizeof( XMLComment ) == sizeof( XMLDeclaration ) ); // use same memory pool
XMLNode* returnNode = 0;
if ( XMLUtil::StringEqual( p, xmlHeader, xmlHeaderLen ) ) {
TIXMLASSERT( sizeof( XMLDeclaration ) == _commentPool.ItemSize() );
returnNode = new (_commentPool.Alloc()) XMLDeclaration( this );
returnNode->_memPool = &_commentPool;
p += xmlHeaderLen;
}
else if ( XMLUtil::StringEqual( p, commentHeader, commentHeaderLen ) ) {
TIXMLASSERT( sizeof( XMLComment ) == _commentPool.ItemSize() );
returnNode = new (_commentPool.Alloc()) XMLComment( this );
returnNode->_memPool = &_commentPool;
p += commentHeaderLen;
}
else if ( XMLUtil::StringEqual( p, cdataHeader, cdataHeaderLen ) ) {
TIXMLASSERT( sizeof( XMLText ) == _textPool.ItemSize() );
XMLText* text = new (_textPool.Alloc()) XMLText( this );
returnNode = text;
returnNode->_memPool = &_textPool;
p += cdataHeaderLen;
text->SetCData( true );
}
else if ( XMLUtil::StringEqual( p, dtdHeader, dtdHeaderLen ) ) {
TIXMLASSERT( sizeof( XMLUnknown ) == _commentPool.ItemSize() );
returnNode = new (_commentPool.Alloc()) XMLUnknown( this );
returnNode->_memPool = &_commentPool;
p += dtdHeaderLen;
}
else if ( XMLUtil::StringEqual( p, elementHeader, elementHeaderLen ) ) {
TIXMLASSERT( sizeof( XMLElement ) == _elementPool.ItemSize() );
returnNode = new (_elementPool.Alloc()) XMLElement( this );
returnNode->_memPool = &_elementPool;
p += elementHeaderLen;
}
else {
TIXMLASSERT( sizeof( XMLText ) == _textPool.ItemSize() );
returnNode = new (_textPool.Alloc()) XMLText( this );
returnNode->_memPool = &_textPool;
p = start; // Back it up, all the text counts.
}
TIXMLASSERT( returnNode );
TIXMLASSERT( p );
*node = returnNode;
return p;
}
下面看XMLNode的搜索,
// Parses the children of this node from the text at p.
//
// Repeatedly asks the document to identify the next node, lets that node
// parse itself, validates it, and appends it as the last child.
//
// p         - current parse position.
// parentEnd - if non-null, receives the name of the closing element that
//             terminated this node's content.
//
// Returns the position after a closing element when one is encountered.
// Returns 0 in every other case: the loop ended because the input was
// exhausted, no further node could be identified, or an error was recorded
// on the document.
char* XMLNode::ParseDeep( char* p, StrPair* parentEnd )
{
while( p && *p ) {
XMLNode* node = 0;
p = _document->Identify( p, &node );// Identify the node header and create the matching child node.
if ( node == 0 ) {
break;
}
StrPair endTag;
p = node->ParseDeep( p, &endTag );// Let the child parse itself using its own type-specific strategy.
if ( !p ) {
// The child failed to parse: discard it and make sure an error is recorded.
DeleteNode( node );
if ( !_document->Error() ) {
_document->SetError( XML_ERROR_PARSING, 0, 0 );
}
break;
}
XMLDeclaration* decl = node->ToDeclaration();
if ( decl ) {// A declaration, if present, must be the first child of the document.
// A declaration can only be the first child of a document.
// Set error, if document already has children.
if ( !_document->NoChildren() ) {
_document->SetError( XML_ERROR_PARSING_DECLARATION, decl->Value(), 0);
DeleteNode( node );
break;
}
}
XMLElement* ele = node->ToElement();
if ( ele ) {
// A closing element (</xxx>) was parsed: hand its name to the caller,
// discard the node and return; parsing of the current element is finished.
if ( ele->ClosingType() == XMLElement::CLOSING ) {
if ( parentEnd ) {
ele->_value.TransferTo( parentEnd );
}
node->_memPool->SetTracked(); // created and then immediately deleted.
DeleteNode( node );
return p;
}
// Handle an end tag returned to this level.
// And handle a bunch of annoying errors.
bool mismatch = false;
if ( endTag.Empty() ) {
// No end tag was reported by the child, yet the element is still OPEN.
if ( ele->ClosingType() == XMLElement::OPEN ) {
mismatch = true;
}
}
else {
// An end tag was reported, so the element must be OPEN ...
if ( ele->ClosingType() != XMLElement::OPEN ) {
mismatch = true;
}// ... and the end tag name must equal the element's own name.
else if ( !XMLUtil::StringEqual( endTag.GetStr(), ele->Name() ) ) {
mismatch = true;
}
}
if ( mismatch ) {
_document->SetError( XML_ERROR_MISMATCHED_ELEMENT, ele->Name(), 0 );
DeleteNode( node );
break;
}
}
InsertEndChild( node );// Append the parsed node as the last child of this node.
}
return 0;
}
XMLText的搜索策略:
// --------- XMLText ---------- //
// Parses the content of a text node.
//
// For a CDATA node the text runs up to "]]>"; for ordinary text it runs up
// to the next "<".
//
// Returns, for CDATA, whatever ParseText returns (0 on failure, with
// XML_ERROR_PARSING_CDATA recorded). For ordinary text it returns the
// position one character before ParseText's result when more input follows,
// and 0 otherwise; XML_ERROR_PARSING_TEXT is recorded when ParseText fails.
char* XMLText::ParseDeep( char* p, StrPair* )
{
    const char* const textBegin = p;

    if ( this->CData() ) {
        p = _value.ParseText( p, "]]>", StrPair::NEEDS_NEWLINE_NORMALIZATION );
        if ( p == 0 ) {
            _document->SetError( XML_ERROR_PARSING_CDATA, textBegin, 0 );
        }
        return p;
    }

    // Ordinary text: entity handling and whitespace collapsing follow the
    // document's settings.
    int parseFlags = StrPair::TEXT_ELEMENT_LEAVE_ENTITIES;
    if ( _document->ProcessEntities() ) {
        parseFlags = StrPair::TEXT_ELEMENT;
    }
    if ( _document->WhitespaceMode() == COLLAPSE_WHITESPACE ) {
        parseFlags |= StrPair::NEEDS_WHITESPACE_COLLAPSING;
    }

    p = _value.ParseText( p, "<", parseFlags );
    if ( p == 0 ) {
        _document->SetError( XML_ERROR_PARSING_TEXT, textBegin, 0 );
        return 0;
    }
    if ( *p != 0 ) {
        // Step back one character; presumably this hands the '<' that ended
        // the text back to the caller so the next node can be identified.
        return p - 1;
    }
    return 0;
}
XMLComment:
// Parses a comment body: everything up to the terminating "-->" is stored as
// this node's value.
//
// Returns the position reported by ParseText, or 0 when parsing fails, in
// which case XML_ERROR_PARSING_COMMENT is recorded on the document.
char* XMLComment::ParseDeep( char* p, StrPair* )
{
    const char* const commentBegin = p;
    char* const afterComment = _value.ParseText( p, "-->", StrPair::COMMENT );
    if ( !afterComment ) {
        _document->SetError( XML_ERROR_PARSING_COMMENT, commentBegin, 0 );
    }
    return afterComment;
}
XMLDeclaration:
// Parses a declaration body: everything up to the terminating "?>" is stored
// as this node's value.
//
// Returns the position reported by ParseText, or 0 when parsing fails, in
// which case XML_ERROR_PARSING_DECLARATION is recorded on the document.
char* XMLDeclaration::ParseDeep( char* p, StrPair* )
{
    const char* const declarationBegin = p;
    char* const afterDeclaration = _value.ParseText( p, "?>", StrPair::NEEDS_NEWLINE_NORMALIZATION );
    if ( !afterDeclaration ) {
        _document->SetError( XML_ERROR_PARSING_DECLARATION, declarationBegin, 0 );
    }
    return afterDeclaration;
}
XMLUnknown:
// Parses an unknown ("<!...>") node: everything up to the next ">" is stored
// as this node's value.
//
// Returns the position reported by ParseText, or 0 when parsing fails, in
// which case XML_ERROR_PARSING_UNKNOWN is recorded on the document.
char* XMLUnknown::ParseDeep( char* p, StrPair* )
{
    const char* const unknownBegin = p;
    char* const afterUnknown = _value.ParseText( p, ">", StrPair::NEEDS_NEWLINE_NORMALIZATION );
    if ( afterUnknown == 0 ) {
        _document->SetError( XML_ERROR_PARSING_UNKNOWN, unknownBegin, 0 );
    }
    return afterUnknown;
}
XMLElement:
char* XMLElement::ParseDeep( char* p, StrPair* strPair )
{
// Read the element name.
p = XMLUtil::SkipWhiteSpace( p );
// The closing element is the </element> form. It is
// parsed just like a regular element then deleted from
// the DOM.
if ( *p == '/' ) {
_closingType = CLOSING;
++p;
}
p = _value.ParseName( p );
if ( _value.Empty() ) {
return 0;
}
p = ParseAttributes( p );
if ( !p || !*p || _closingType ) {
return p;
}
<span style="white-space:pre"> </span>//调用XMLNode中的解析方法,就是将当前节点设为父节点,在这里实现递归调用
p = XMLNode::ParseDeep( p, strPair );
return p;
}
XMLDocument就使用XMLNode的策略,因为他是一个父节点。
可以开始编写LoadFile的代码了,XMLError是个枚举类型,一般而言,先定义一个枚举,同时定义XML_SUCCESS=0,因为成功不需要理由,失败者总是需要借口。等到编码到后期再添加错误码。
// Loads and parses the XML file at the given path.
//
// The document is cleared first. The file is opened in binary read mode; if
// it cannot be opened, XML_ERROR_FILE_NOT_FOUND is recorded with the
// filename. Otherwise the work is delegated to LoadFile( FILE* ) and the
// file is closed afterwards.
//
// Returns the document's error ID after the attempt.
XMLError XMLDocument::LoadFile( const char* filename )
{
    Clear();

    FILE* const file = callfopen( filename, "rb" );
    if ( file == 0 ) {
        SetError( XML_ERROR_FILE_NOT_FOUND, filename, 0 );
    }
    else {
        LoadFile( file );
        fclose( file );
    }
    return _errorID;
}
如果打开文件失败,返回XML_ERROR_FILE_NOT_FOUND错误。这时候可以在enum XMLError中添加错误码,以后为了不影响主要源码的阅读,不再细说。接着调用LoadFile
// Reads the whole stream into _charBuffer and parses it.
//
// The document is cleared first. The stream is rewound, its length is
// measured by seeking to the end, and the content is read in one call into a
// freshly allocated, NUL-terminated buffer.
//
// Records XML_ERROR_FILE_READ_ERROR when the first read fails with the
// stream's error flag set, when the length cannot be determined, when the
// length does not pass the LongFitsIntoSizeTMinusOne check, or when fewer
// bytes than expected are read. Records XML_ERROR_EMPTY_DOCUMENT for a
// zero-length file.
//
// Returns the document's error ID after the attempt.
XMLError XMLDocument::LoadFile( FILE* fp )
{
    Clear();

    // Probe the stream with a single read: EOF together with the error flag
    // means the stream cannot be read (EOF alone is handled further down).
    fseek( fp, 0, SEEK_SET );
    const int firstByte = fgetc( fp );
    if ( firstByte == EOF && ferror( fp ) != 0 ) {
        SetError( XML_ERROR_FILE_READ_ERROR, 0, 0 );
        return _errorID;
    }

    // Measure the length by seeking to the end, then rewind for the read.
    fseek( fp, 0, SEEK_END );
    const long byteCount = ftell( fp );
    fseek( fp, 0, SEEK_SET );
    if ( byteCount == -1L ) {
        // ftell failed.
        SetError( XML_ERROR_FILE_READ_ERROR, 0, 0 );
        return _errorID;
    }
    TIXMLASSERT( byteCount >= 0 );

    if ( !LongFitsIntoSizeTMinusOne<>::Fits( byteCount ) ) {
        // Cannot handle files which won't fit in buffer together with null terminator
        SetError( XML_ERROR_FILE_READ_ERROR, 0, 0 );
        return _errorID;
    }

    if ( byteCount == 0 ) {
        SetError( XML_ERROR_EMPTY_DOCUMENT, 0, 0 );
        return _errorID;
    }

    // The length is known: allocate room for the content plus a terminator.
    const size_t size = static_cast<size_t>( byteCount );
    TIXMLASSERT( _charBuffer == 0 );
    _charBuffer = new char[size+1];

    const size_t bytesRead = fread( _charBuffer, 1, size, fp );
    if ( bytesRead != size ) {
        SetError( XML_ERROR_FILE_READ_ERROR, 0, 0 );
        return _errorID;
    }
    _charBuffer[size] = 0;

    Parse();
    return _errorID;
}
开始解析文档,utf8 bom 自行搜索:
// Parses the text held in _charBuffer into this document's node tree.
//
// Leading whitespace is skipped and a byte-order mark, if present, is read
// (its presence is stored in _writeBOM). If nothing follows,
// XML_ERROR_EMPTY_DOCUMENT is recorded; otherwise parsing is delegated to
// ParseDeep with no parent end tag.
void XMLDocument::Parse()
{
    TIXMLASSERT( NoChildren() ); // Clear() must have been called previously
    TIXMLASSERT( _charBuffer );

    char* cursor = _charBuffer;
    cursor = XMLUtil::SkipWhiteSpace( cursor );
    cursor = const_cast<char*>( XMLUtil::ReadBOM( cursor, &_writeBOM ) );

    if ( *cursor == 0 ) {
        SetError( XML_ERROR_EMPTY_DOCUMENT, 0, 0 );
        return;
    }
    ParseDeep( cursor, 0 );
}