markup.h + markup.cpp

最新推荐文章于 2023-11-16 16:02:59 发布

iteye_8601

最新推荐文章于 2023-11-16 16:02:59 发布

阅读量3.1k

点赞数

文章标签： c/c++

markup.h and markup.cpp, available from http://www.firstobject.com/, are tools for XML operations in the C++ language.

markup.h

// Markup.h: interface for the CMarkup class. // // Markup Release 6.1 Lite // Copyright (C) 1999-2001 First Objective Software, Inc. All rights reserved // This entire notice must be retained in this source code // Redistributing this source code requires written permission // This software is provided "as is", with no warranty. // Latest fixes enhancements and documentation at www.firstobject.com #if !defined(AFX_MARKUP_H__948A2705_9E68_11D2_A0BF_00105A27C570__INCLUDED_) #define AFX_MARKUP_H__948A2705_9E68_11D2_A0BF_00105A27C570__INCLUDED_ #if _MSC_VER > 1000 #pragma once #endif // _MSC_VER > 1000 #include <afxtempl.h> class CMarkup { public: CMarkup() { SetDoc( NULL ); }; CMarkup( LPCTSTR szDoc ) { SetDoc( szDoc ); }; CMarkup( const CMarkup& markup ) { *this = markup; }; void operator=( const CMarkup& markup ); virtual ~CMarkup() {}; // Navigate bool SetDoc( LPCTSTR szDoc ); bool IsWellFormed(); bool FindElem( LPCTSTR szName=NULL ); bool FindChildElem( LPCTSTR szName=NULL ); bool IntoElem(); bool OutOfElem(); void ResetChildPos() { m_iPosChild = 0; }; void ResetPos(); CString GetTagName() const { return x_GetTagName(m_iPos); }; CString GetChildTagName() const { return x_GetTagName(m_iPosChild); }; CString GetData() const { return x_GetData(m_iPos); }; CString GetChildData() const { return x_GetData(m_iPosChild); }; CString GetAttrib( LPCTSTR szAttrib ) const { return x_GetAttrib(m_iPos,szAttrib); }; CString GetChildAttrib( LPCTSTR szAttrib ) const { return x_GetAttrib(m_iPosChild,szAttrib); }; bool GetOffsets( int& nStart, int& nEnd ) const; CString GetError() const { return m_csError; }; // Create CString GetDoc() const { return m_csDoc; }; bool AddElem( LPCTSTR szName, LPCTSTR szData=NULL ); bool AddChildElem( LPCTSTR szName, LPCTSTR szData=NULL ); bool AddAttrib( LPCTSTR szAttrib, LPCTSTR szValue ); bool AddChildAttrib( LPCTSTR szAttrib, LPCTSTR szValue ); protected: CString m_csDoc; int m_nLevel; CString m_csError; struct ElemPos { ElemPos() { Clear(); }; 
ElemPos( const ElemPos& pos ) { *this = pos; }; bool IsEmptyElement() const { return (nStartR == nEndL + 1); }; void Clear() { nStartL=0; nStartR=0; nEndL=0; nEndR=0; nNext=0; iElemParent=0; iElemChild=0; iElemNext=0; }; int nStartL; int nStartR; int nEndL; int nEndR; int nNext; int iElemParent; int iElemChild; int iElemNext; }; CArray< ElemPos, ElemPos& > m_aPos; int m_iPos; int m_iPosChild; int m_iPosFree; int x_GetFreePos(); int x_ReleasePos(); struct TokenPos { TokenPos() { Clear(); }; bool IsValid() const { return (nL <= nR); }; void Clear() { nL=0; nR=-1; bIsString=false; }; int nL; int nR; int nNext; bool bIsString; }; int x_ParseElem( int iPos ); int x_ParseError( LPCTSTR szError, LPCTSTR szTag = NULL ); bool x_FindChar( int&n, _TCHAR c ) const; bool x_FindToken( TokenPos& token ) const; CString x_GetToken( const TokenPos& token ) const; CString x_GetTagName( int iPos ) const; CString x_GetData( int iPos ) const; CString x_GetAttrib( int iPos, LPCTSTR szAttrib ) const; int x_Add( int iPosParent, int iPosBefore, LPCTSTR szName, LPCTSTR szValue ); bool x_FindAttrib( TokenPos& token, LPCTSTR szAttrib=NULL ) const; int x_AddAttrib( int iPos, LPCTSTR szAttrib, LPCTSTR szValue ); int x_SetAttrib( int iPos, LPCTSTR szAttrib, LPCTSTR szValue ); bool x_SetData( int iPos, LPCTSTR szData, int nCDATA ); void x_DocChange( int nLeft, int nReplace, const CString& csInsert ); void x_PosInsert( int iPos, int nInsertLength ); void x_Adjust( int iPos, int nShift ); CString x_TextToDoc( LPCTSTR szText, bool bAttrib = false ) const; CString x_TextFromDoc( int nLeft, int nRight ) const; }; #endif // !defined(AFX_MARKUP_H__948A2705_9E68_11D2_A0BF_00105A27C570__INCLUDED_)

markup.cpp

// Markup.cpp: implementation of the CMarkup class. // // Markup Release 6.1 Lite // Copyright (C) 1999-2001 First Objective Software, Inc. All rights reserved // This entire notice must be retained in this source code // Redistributing this source code requires written permission // This software is provided "as is", with no warranty. // Latest fixes enhancements and documentation at www.firstobject.com #include "stdafx.h" #include "afxconv.h" #include "Markup.h" #ifdef _DEBUG #undef THIS_FILE static char THIS_FILE[]=__FILE__; #define new DEBUG_NEW #endif void CMarkup::operator=( const CMarkup& markup ) { m_iPos = markup.m_iPos; m_iPosChild = markup.m_iPosChild; m_iPosFree = markup.m_iPosFree; m_aPos.RemoveAll(); m_aPos.Append( markup.m_aPos ); m_nLevel = markup.m_nLevel; m_csDoc = markup.m_csDoc; } void CMarkup::ResetPos() { // Reset the main and child positions m_iPos = 0; m_iPosChild = 0; m_nLevel = 0; }; bool CMarkup::SetDoc( LPCTSTR szDoc ) { // Reset indexes m_iPosFree = 1; ResetPos(); // Set document text if ( szDoc ) m_csDoc = szDoc; else m_csDoc.Empty(); // Starting size of position array: 1 element per 64 bytes of document // Tight fit when parsing small doc, only 0 to 2 reallocs when parsing large doc // Start at 8 when creating new document int nStartSize = m_csDoc.GetLength() / 64 + 8; if ( m_aPos.GetSize() < nStartSize ) m_aPos.SetSize( nStartSize ); // Parse document bool bWellFormed = false; if ( m_csDoc.GetLength() ) { m_aPos[0].Clear(); int iPos = x_ParseElem( 0 ); if ( iPos > 0 ) { m_aPos[0].iElemChild = iPos; bWellFormed = true; } } // Clear indexes if parse failed or empty document if ( ! 
bWellFormed ) { m_aPos[0].Clear(); m_iPosFree = 1; } ResetPos(); return bWellFormed; }; bool CMarkup::IsWellFormed() { if ( m_aPos.GetSize() && m_aPos[0].iElemChild ) return TRUE; return FALSE; } bool CMarkup::FindElem( LPCTSTR szName ) { // If szName is NULL or empty, go to next sibling element // Otherwise go to next sibling element with matching tag name // If the current position is valid, start looking from next // Change current position only if found // int iPos = m_iPos; if ( ! iPos ) { if ( m_aPos.GetSize() ) iPos = m_aPos[0].iElemChild; } else { iPos = m_aPos[iPos].iElemNext; } while ( iPos ) { // Compare tag name unless szName is not specified if ( szName == NULL || !szName[0] || x_GetTagName(iPos) == szName ) { // Assign new position m_iPos = iPos; m_iPosChild = 0; return true; } iPos = m_aPos[iPos].iElemNext; } return false; } bool CMarkup::FindChildElem( LPCTSTR szName ) { // If szName is NULL or empty, go to next sibling child element // Otherwise go to next sibling child element with matching tag name // If the current child position is valid, start looking from next // Change current child position only if found // // Shorthand: call this with no current position means under root element if ( ! m_iPos ) FindElem(); // Is main position valid and not empty? if ( ! m_iPos || m_aPos[m_iPos].IsEmptyElement() ) return false; // Is current child position valid? int iPosChild = m_iPosChild; if ( iPosChild ) iPosChild = m_aPos[iPosChild].iElemNext; else iPosChild = m_aPos[m_iPos].iElemChild; // Search while ( iPosChild ) { // Compare tag name unless szName is not specified if ( szName == NULL || !szName[0] || x_GetTagName(iPosChild) == szName ) { // Assign new position m_iPosChild = iPosChild; return true; } iPosChild = m_aPos[iPosChild].iElemNext; } return false; } bool CMarkup::IntoElem() { // Find child element unless there is already a child element position if ( ! 
m_iPosChild ) FindChildElem(); if ( m_iPosChild ) { m_iPos = m_iPosChild; m_iPosChild = 0; ++m_nLevel; return true; } return false; } bool CMarkup::OutOfElem() { // Go to parent element if ( m_iPos && m_nLevel > 0 ) { m_iPosChild = m_iPos; m_iPos = m_aPos[m_iPos].iElemParent; --m_nLevel; return true; } return false; } bool CMarkup::GetOffsets( int& nStart, int& nEnd ) const { // Return document offsets of current main position element // This is not part of EDOM but is used by the Markup project if ( m_iPos ) { nStart = m_aPos[m_iPos].nStartL; nEnd = m_aPos[m_iPos].nEndR; return true; } return false; } bool CMarkup::AddElem( LPCTSTR szName, LPCTSTR szValue ) { // Add an element after current main position int iPosParent = m_iPos? m_aPos[m_iPos].iElemParent : 0; m_iPosChild = 0; // Setting root element? if ( iPosParent == 0 ) { if ( IsWellFormed() ) return false; m_csDoc.Empty(); } m_iPos = x_Add( iPosParent, m_iPos, szName, szValue ); return true; } bool CMarkup::AddChildElem( LPCTSTR szName, LPCTSTR szValue ) { // Add a child element under main position, after current child position if ( ! m_iPos ) return false; // If no child position, add after last sibling int iPosLast = m_aPos[m_iPos].iElemChild; if ( ! 
m_iPosChild && iPosLast ) { m_iPosChild = iPosLast; while ( (iPosLast=m_aPos[iPosLast].iElemNext) != 0 ) m_iPosChild = iPosLast; } m_iPosChild = x_Add( m_iPos, m_iPosChild, szName, szValue ); return true; } bool CMarkup::AddAttrib( LPCTSTR szAttrib, LPCTSTR szValue ) { // Add attribute to current main position element if ( m_iPos ) { x_AddAttrib( m_iPos, szAttrib, szValue ); return true; } return false; } bool CMarkup::AddChildAttrib( LPCTSTR szAttrib, LPCTSTR szValue ) { // Add attribute to current child position element if ( m_iPosChild ) { x_AddAttrib( m_iPosChild, szAttrib, szValue ); return true; } return false; } // // Private Methods // int CMarkup::x_GetFreePos() { // // This returns the index of the next unused ElemPos in the array // if ( m_iPosFree == m_aPos.GetSize() ) m_aPos.SetSize( m_iPosFree + m_iPosFree / 2 ); ++m_iPosFree; return m_iPosFree - 1; } int CMarkup::x_ReleasePos() { // // This decrements the index of the next unused ElemPos in the array // allowing the element index returned by GetFreePos() to be reused // --m_iPosFree; return 0; } int CMarkup::x_ParseError( LPCTSTR szError, LPCTSTR szTag ) { if ( szTag ) m_csError.Format( szError, szTag ); else m_csError = szError; x_ReleasePos(); return -1; } int CMarkup::x_ParseElem( int iPosParent ) { // This is either called by SetDoc, x_AddSubDoc, or itself recursively // m_aPos[iPosParent].nEndL is where to start parsing for the child element // This returns the new position if a tag is found, otherwise zero // In all cases we need to get a new ElemPos, but release it if unused // int iPos = x_GetFreePos(); m_aPos[iPos].nStartL = m_aPos[iPosParent].nEndL; m_aPos[iPos].nNext = m_aPos[iPosParent].nStartR + 1; m_aPos[iPos].iElemParent = iPosParent; m_aPos[iPos].iElemChild = 0; m_aPos[iPos].iElemNext = 0; // Start Tag // A loop is used to ignore all remarks tags and special tags // i.e. <?xml version="1.0"?>, and  // So any tag beginning with ? or ! 
is ignored // Loop past ignored tags TokenPos token; token.nNext = m_aPos[iPosParent].nEndL; CString csName; while ( csName.IsEmpty() ) { // Look for left angle bracket of start tag m_aPos[iPos].nStartL = token.nNext; if ( ! x_FindChar( m_aPos[iPos].nStartL, _T('<') ) ) return x_ParseError( _T("Element tag not found") ); // Set parent's End tag to start looking from here (or later) m_aPos[iPosParent].nEndL = m_aPos[iPos].nStartL; // Determine whether this is an element, comment or version tag LPCTSTR szEndOfTag = NULL; token.nNext = m_aPos[iPos].nStartL + 1; if ( x_FindToken( token ) ) { if ( token.bIsString ) return x_ParseError( _T("Tag starts with quote") ); TCHAR cFirstChar = m_csDoc[token.nL]; if ( cFirstChar == _T('?') ) szEndOfTag = _T("?>"); // version else if ( cFirstChar == _T('!') ) { TCHAR cSecondChar = 0; if ( token.nL+1 < m_csDoc.GetLength() ) cSecondChar = m_csDoc[token.nL+1]; if ( cSecondChar == _T('[') ) szEndOfTag = _T("]]>"); // CDATA section else if ( cSecondChar == _T('-') ) szEndOfTag = _T("-->"); // comment else szEndOfTag = _T(">"); // DTD } else if ( cFirstChar != _T('/') ) { csName = x_GetToken( token ); szEndOfTag = _T(">"); } else return x_ReleasePos(); // probably end tag of parent } else return x_ParseError( _T("Abrupt end within tag") ); // Look for end of tag token.nNext = m_csDoc.Find( szEndOfTag, token.nNext ); if ( token.nNext == -1 ) return x_ParseError( _T("End of tag not found") ); } m_aPos[iPos].nStartR = token.nNext; // Is ending mark within start tag, i.e. empty element? 
if ( m_csDoc[m_aPos[iPos].nStartR-1] == _T('/') ) { // Empty element // Close tag left is set to ending mark, and right to open tag right m_aPos[iPos].nEndL = m_aPos[iPos].nStartR-1; m_aPos[iPos].nEndR = m_aPos[iPos].nStartR; } else // look for end tag { // Element probably has contents // Determine where to start looking for left angle bracket of end tag // This is done by recursively parsing the contents of this element int iInner, iInnerPrev = 0; m_aPos[iPos].nEndL = m_aPos[iPos].nStartR + 1; while ( (iInner = x_ParseElem( iPos )) > 0 ) { // Set links to iInner if ( iInnerPrev ) m_aPos[iInnerPrev].iElemNext = iInner; else m_aPos[iPos].iElemChild = iInner; iInnerPrev = iInner; // Set offset to reflect child m_aPos[iPos].nEndL = m_aPos[iInner].nEndR + 1; } if ( iInner == -1 ) return -1; // Look for left angle bracket of end tag if ( ! x_FindChar( m_aPos[iPos].nEndL, _T('<') ) ) return x_ParseError( _T("End tag of %s element not found"), csName ); // Look through tokens of end tag token.nNext = m_aPos[iPos].nEndL + 1; int nTokenCount = 0; while ( x_FindToken( token ) ) { ++nTokenCount; if ( ! token.bIsString ) { // Is first token not an end slash mark? if ( nTokenCount == 1 && m_csDoc[token.nL] != _T('/') ) return x_ParseError( _T("Expecting end tag of element %s"), csName ); else if ( nTokenCount == 2 && csName != x_GetToken( token ) ) return x_ParseError( _T("End tag does not correspond to %s"), csName ); // Else is it a right angle bracket? else if ( m_csDoc[token.nL] == _T('>') ) break; } } // Was a right angle bracket not found? if ( ! 
token.IsValid() || nTokenCount < 2 ) return x_ParseError( _T("End tag not completed for element %s"), csName ); m_aPos[iPos].nEndR = token.nL; } // Successfully found positions of angle brackets m_aPos[iPos].nNext = m_aPos[iPos].nEndR; x_FindChar( m_aPos[iPos].nNext, _T('<') ); return iPos; } bool CMarkup::x_FindChar( int&n, _TCHAR c ) const { // Look for char c starting at n, and set n to point to it // c is always the first char of a multi-byte char // Return false if not found before end of document LPCTSTR szDoc = (LPCTSTR)m_csDoc; while ( szDoc[n] && szDoc[n] != c ) n += _tclen( &szDoc[n] ); if ( ! szDoc[n] ) return false; return true; } bool CMarkup::x_FindToken( CMarkup::TokenPos& token ) const { // Starting at token.nNext, find the next token // upon successful return, token.nNext points after the retrieved token LPCTSTR szDoc = (LPCTSTR)m_csDoc; int n = token.nNext; // Statically defined CStrings for whitespace and special chars static CString csWhitespace = _T(" /t/n/r"); static CString csSpecial = _T("<>=///?!"); // By-pass leading whitespace while ( szDoc[n] && csWhitespace.Find(szDoc[n]) > -1 ) ++n; // Are we still within the document? token.bIsString = false; if ( szDoc[n] ) { // Is it an opening quote? if ( szDoc[n] == _T('/"') ) { // Move past opening quote ++n; token.nL = n; // Look for closing quote x_FindChar( n, _T('/"') ); // Set right to before closing quote token.nR = n-1; // Set n past closing quote unless at end of document if ( szDoc[n] ) ++n; // Set flag token.bIsString = true; } else { // Go until special char or whitespace token.nL = n; while ( szDoc[n] && csSpecial.Find(m_csDoc[n]) == -1 && csWhitespace.Find(m_csDoc[n]) == -1 ) n += _tclen(&szDoc[n]); // Adjust end position if it is one special char if ( n == token.nL ) ++n; // it is a special char token.nR = n-1; } } token.nNext = n; if ( ! 
szDoc[n] ) return false; // nNext points to one past last char of token return true; } CString CMarkup::x_GetToken( const CMarkup::TokenPos& token ) const { // The token contains indexes into the document identifying a small substring // Build the substring from those indexes and return it if ( ! token.IsValid() ) return _T(""); return m_csDoc.Mid( token.nL, token.nR - token.nL + ((token.nR<m_csDoc.GetLength())? 1:0) ); } CString CMarkup::x_GetTagName( int iPos ) const { // Return the tag name at specified element TokenPos token; token.nNext = m_aPos[iPos].nStartL + 1; if ( ! iPos || ! x_FindToken( token ) ) return _T(""); // Return substring of document return x_GetToken( token ); } bool CMarkup::x_FindAttrib( CMarkup::TokenPos& token, LPCTSTR szAttrib ) const { // If szAttrib is NULL find next attrib, otherwise find named attrib // Return true if found int nAttrib = 0; for ( int nCount = 0; x_FindToken(token); ++nCount ) { if ( ! token.bIsString ) { // Is it the right angle bracket? if ( m_csDoc[token.nL] == _T('>') || m_csDoc[token.nL] == _T('/') ) break; // attrib not found // Equal sign if ( m_csDoc[token.nL] == _T('=') ) continue; // Potential attribute if ( ! nAttrib && nCount ) { // Attribute name search? if ( ! szAttrib || ! szAttrib[0] ) return true; // return with token at attrib name // Compare szAttrib if ( x_GetToken(token) == szAttrib ) nAttrib = nCount; } } else if ( nAttrib && nCount == nAttrib + 2 ) { return true; } } // Not found return false; } CString CMarkup::x_GetAttrib( int iPos, LPCTSTR szAttrib ) const { // Return the value of the attrib at specified element TokenPos token; token.nNext = m_aPos[iPos].nStartL + 1; if ( szAttrib && x_FindAttrib( token, szAttrib ) ) return x_TextFromDoc( token.nL, token.nR - ((token.nR<m_csDoc.GetLength())?0:1) ); return _T(""); } CString CMarkup::x_GetData( int iPos ) const { // Return a string representing data between start and end tag // Return empty string if there are any children elements if ( ! 
m_aPos[iPos].iElemChild && ! m_aPos[iPos].IsEmptyElement() ) { // See if it is a CDATA section TokenPos token; token.nNext = m_aPos[iPos].nStartR+1; if ( x_FindToken( token ) && m_csDoc[token.nL] == _T('<') && token.nL + 11 < m_aPos[iPos].nEndL && _tcsncmp( &((LPCTSTR)m_csDoc)[token.nL+1], _T("![CDATA["), 8 ) == 0 ) { int nEndCDATA = m_csDoc.Find( _T("]]>"), token.nNext ); if ( nEndCDATA != -1 && nEndCDATA < m_aPos[iPos].nEndL ) { return m_csDoc.Mid( token.nL+9, nEndCDATA-token.nL-9 ); } } return x_TextFromDoc( m_aPos[iPos].nStartR+1, m_aPos[iPos].nEndL-1 ); } return ""; } CString CMarkup::x_TextToDoc( LPCTSTR szText, bool bAttrib ) const { // Convert text as seen outside XML document to XML friendly // replacing special characters with ampersand escape codes // E.g. convert "6>7" to "6>7" // // < less than // & ampersand // > greater than // // and for attributes: // // ' apostrophe or single quote // " double quote // static _TCHAR* szaReplace[] = { _T("<"),_T("&"),_T(">"),_T("'"),_T(""") }; const _TCHAR* pFind = bAttrib?_T("<&>/'/""):_T("<&>"); CString csText; const _TCHAR* pSource = szText; int nDestSize = _tcslen(pSource); nDestSize += nDestSize / 10 + 7; _TCHAR* pDest = csText.GetBuffer(nDestSize); int nLen = 0; _TCHAR cSource = *pSource; _TCHAR* pFound; while ( cSource ) { if ( nLen > nDestSize - 6 ) { csText.ReleaseBuffer(nLen); nDestSize *= 2; pDest = csText.GetBuffer(nDestSize); } if ( (pFound=_tcschr(pFind,cSource)) != NULL ) { pFound = szaReplace[pFound-pFind]; _tcscpy(&pDest[nLen],pFound); nLen += _tcslen(pFound); } else { _tccpy( &pDest[nLen], pSource ); ++nLen; } pSource += _tclen( pSource ); cSource = *pSource; } csText.ReleaseBuffer(nLen); return csText; } CString CMarkup::x_TextFromDoc( int nLeft, int nRight ) const { // Convert XML friendly text to text as seen outside XML document // replacing ampersand escape codes with special characters // E.g. 
convert "6>7" to "6>7" // // Conveniently the result is always the same or shorter in length // static _TCHAR* szaCode[] = { _T("lt;"),_T("amp;"),_T("gt;"),_T("apos;"),_T("quot;") }; static int anCodeLen[] = { 3,4,3,5,5 }; static _TCHAR* szSymbol = _T("<&>/'/""); CString csText; const _TCHAR* pSource = m_csDoc; int nDestSize = nRight - nLeft + 1; _TCHAR* pDest = csText.GetBuffer(nDestSize); int nLen = 0; int nCharLen; int nChar = nLeft; while ( nChar <= nRight ) { if ( pSource[nChar] == _T('&') ) { // Look for matching &code; for ( int nMatch = 0; nMatch < 5; ++nMatch ) { if ( nChar <= nRight - anCodeLen[nMatch] && _tcsncmp(szaCode[nMatch],&pSource[nChar+1],anCodeLen[nMatch]) == 0 ) { pDest[nLen++] = szSymbol[nMatch]; nChar += anCodeLen[nMatch] + 1; break; } } // If no match is found it means XML doc is invalid // no devastating harm done, ampersand code will just be left in result if ( nMatch == 5 ) { pDest[nLen++] = _T('&'); ++nChar; } } else { nCharLen = _tclen(&pSource[nChar]); _tccpy( &pDest[nLen], &pSource[nChar] ); nLen += nCharLen; nChar += nCharLen; } } csText.ReleaseBuffer(nLen); return csText; } void CMarkup::x_DocChange( int nLeft, int nReplace, const CString& csInsert ) { // Insert csInsert int m_csDoc at nLeft replacing nReplace chars // Do this with only one buffer reallocation if it grows // int nDocLength = m_csDoc.GetLength(); int nInsLength = csInsert.GetLength(); // Make sure nLeft and nReplace are within bounds nLeft = max( 0, min( nLeft, nDocLength ) ); nReplace = max( 0, min( nReplace, nDocLength-nLeft ) ); // Get pointer to buffer with enough room int nNewLength = nInsLength + nDocLength - nReplace; int nBufferLen = nNewLength; _TCHAR* pDoc = m_csDoc.GetBuffer( nBufferLen ); // Move part of old doc that goes after insert if ( nLeft+nReplace < nDocLength ) memmove( &pDoc[nLeft+nInsLength], &pDoc[nLeft+nReplace], (nDocLength-nLeft-nReplace)*sizeof(_TCHAR) ); // Copy insert memcpy( &pDoc[nLeft], csInsert, nInsLength*sizeof(_TCHAR) ); // Release 
m_csDoc.ReleaseBuffer( nNewLength ); } void CMarkup::x_Adjust( int iPos, int nShift ) { // Loop through affected elements and adjust indexes // Does not affect iPos itself // Algorithm: // 1. update next siblings and all their children // 2. then go up a level update end points and to step 1 int iPosTop = m_aPos[iPos].iElemParent; while ( iPos ) { // Were we at containing parent of affected position? bool bPosTop = false; if ( iPos == iPosTop ) { // Move iPosTop up one towards root iPosTop = m_aPos[iPos].iElemParent; bPosTop = true; } // Traverse to the next update position if ( ! bPosTop && m_aPos[iPos].iElemChild ) { // Depth first iPos = m_aPos[iPos].iElemChild; } else if ( m_aPos[iPos].iElemNext ) { iPos = m_aPos[iPos].iElemNext; } else { // Look for next sibling of a parent of iPos // When going back up, parents have already been done except iPosTop while ( (iPos=m_aPos[iPos].iElemParent) != 0 && iPos != iPosTop ) if ( m_aPos[iPos].iElemNext ) { iPos = m_aPos[iPos].iElemNext; break; } } // Shift indexes at iPos if ( iPos != iPosTop ) { // Move the start tag indexes // Don't do this for containing parent tag m_aPos[iPos].nStartL += nShift; m_aPos[iPos].nStartR += nShift; } // Move end tag indexes m_aPos[iPos].nEndL += nShift; m_aPos[iPos].nEndR += nShift; m_aPos[iPos].nNext += nShift; } } int CMarkup::x_Add( int iPosParent, int iPosBefore, LPCTSTR szName, LPCTSTR szValue ) { // Create element and modify positions of affected elements // if iPosBefore is NULL, insert as first element under parent // If no szValue is specified, an empty element is created // i.e. 
either <NAME>value</NAME> or <NAME/> // int iPos = x_GetFreePos(); bool bEmptyParent = false; if ( iPosBefore ) { // Follow iPosBefore m_aPos[iPos].nStartL = m_aPos[iPosBefore].nNext; } else if ( m_aPos[iPosParent].iElemChild ) { // Insert before first child of parent m_aPos[iPos].nStartL = m_aPos[m_aPos[iPosParent].iElemChild].nStartL; } else if ( m_aPos[iPosParent].IsEmptyElement() ) { // Parent has no separate end tag m_aPos[iPos].nStartL = m_aPos[iPosParent].nStartR + 2; bEmptyParent = true; } else { // Parent has content, but no children m_aPos[iPos].nStartL = m_aPos[iPosParent].nEndL; } // Set links m_aPos[iPos].iElemParent = iPosParent; m_aPos[iPos].iElemChild = 0; if ( iPosBefore ) { m_aPos[iPos].iElemNext = m_aPos[iPosBefore].iElemNext; m_aPos[iPosBefore].iElemNext = iPos; } else { m_aPos[iPos].iElemNext = m_aPos[iPosParent].iElemChild; m_aPos[iPosParent].iElemChild = iPos; } // Create string for insert CString csInsert; int nLenName = _tcslen(szName); int nLenValue = szValue? _tcslen(szValue) : 0; if ( ! nLenValue ) { // <NAME/> empty element csInsert.Format( _T("

This article has been re-written with the help of 2 years of feedback, and the new source code has benefited from all of the fixes and developments during that time period. See release notes below.

Introduction

Oftentimes you don't want to invest in learning a complex XML tool to implement a little bit of XML processing in your application. It's so easy! Just add Markup.cpp and Markup.h to your Visual C++ MFC project, #include "Markup.h", and begin using it. There are no other dependencies.

Features

Light: one small class that maintains one single document string with a simple array of indexes
Fast: the parser builds the index array in one quick pass
Simple: EDOM methods make it ridiculously easy to create or process XML strings
Independent: compiles into your program without requiring MSXML or any tokenizer
UNICODE: can be compiled for UNICODE for Windows CE and NT/XP platforms (define _UNICODE)
UTF-8: when not in UNICODE or MBCS builds, it works with UTF-8, ASCII, or Windows extended sets
MBCS: can be compiled for Windows double-byte character sets such as Chinese GB2312 (define _MBCS)

XML for Everyday Data

We often need to store and/or pass information in a file, or send a block of information from computer A to computer B. And the issue is always the same: How shall I format this data? Before XML, you might have considered "env" style e.g. PATH=C:\WIN95; "ini" style (grouped in sections); comma-delimited or otherwise delimited; or fixed character lengths. XML is now the established answer to that question except that programmers are sometimes discouraged by the size and complexity of XML solutions when all they need is something convenient to help parse and format angle brackets. For good minimalist reading on the syntax rules for XML tags, I recommend Beginning XML - Chapter 2: Well-Formed XML posted here on the Code Project.

XML is better because of its flexible and hierarchical nature, plus its wide acceptance. Although XML uses more characters than delimited formats, it compresses down well if needed. The flexibility of XML becomes apparent when you want to expand the types of information your document can contain without requiring every consumer of the information to rewrite processing logic. You can keep the old information identified and ordered the same way it was while adding new attributes and elements.

CMarkup Lite Methods

CMarkup is based on the "Encapsulated" Document Object Model (EDOM), the key to simple XML processing. It's a set of methods for XML processing with the same general purpose as DOM (Document Object Model). But while DOM has numerous types of objects, EDOM defines only one object, the XML document. EDOM harks back to the original attraction of XML which was its simplicity. To keep overhead low, CMarkup takes a very light non-conforming non-validating approach to XML, and it does not verify the XML is well-formed.

The CMarkup "Lite" in this article is the free version of the CMarkup product sold at firstobject.com. CMarkup Lite implements a subset of EDOM methods for creating and parsing XML document strings. The Lite methods also encompass some modification functionality such as setting an attribute or adding additional elements to an existing XML document, but not changing the data of, or removing, XML elements. See the EDOM specification to compare the full CMarkup with CMarkup Lite. The full CMarkup is available in Evaluation (Educational) and licensed Developer versions with many more methods, STL and MSXML versions, Base64, and additional documentation. But this Lite version here at Code Project is more than adequate for parsing and creating simple XML strings in MFC.

The CMarkup Lite methods are grouped into Creation and Navigation categories listed below.

CMarkup Lite Creation Methods

Collapse

Copy Code

// Returns the text of the document held by the object.
CString GetDoc() const { return m_csDoc; };
// Adds an element after the current main position; szData is optional.
bool AddElem( LPCTSTR szName, LPCTSTR szData=NULL );
// Adds an element under the current main position, after the current child
// position; szData is optional.
bool AddChildElem( LPCTSTR szName, LPCTSTR szData=NULL );
// Adds an attribute to the element at the current main position.
bool AddAttrib( LPCTSTR szAttrib, LPCTSTR szValue );
// Adds an attribute to the element at the current child position.
bool AddChildAttrib( LPCTSTR szAttrib, LPCTSTR szValue );
// Described by the article as equivalent to AddAttrib.
bool SetAttrib( LPCTSTR szAttrib, LPCTSTR szValue );
// Described by the article as equivalent to AddChildAttrib.
bool SetChildAttrib( LPCTSTR szAttrib, LPCTSTR szValue );

GetDoc is used to get the document string after adding elements and setting attributes. The AddAttrib and SetAttrib methods do the same thing as each other (as do AddChildAttrib and SetChildAttrib). They will change the attribute's value if it already exists, and add the attribute if it doesn't.

CMarkup Lite Navigation Methods

Collapse

Copy Code

// Parses szDoc and populates the object; returns false on failure
// (see GetError).
bool SetDoc( LPCTSTR szDoc );
// True if the object has at least a root element; this does not verify
// well-formedness.
bool IsWellFormed();
// Moves the main position to the next sibling element, optionally the next
// one whose tag name is szName.
bool FindElem( LPCTSTR szName=NULL );
// Moves the child position to the next child element, optionally the next
// one whose tag name is szName.
bool FindChildElem( LPCTSTR szName=NULL );
// Makes the current child position the main position.
bool IntoElem();
// Makes the parent of the main position the main position again.
bool OutOfElem();
// Clears the child position.
void ResetChildPos();
// NOTE(review): presumably clears only the main position; not described in
// the article text, so confirm against the implementation.
void ResetMainPos();
// Resets the current position back to the beginning of the document.
void ResetPos();
// Tag name / data / attribute value of the element at the main position,
// and the same for the element at the child position.
CString GetTagName() const;
CString GetChildTagName() const;
CString GetData() const;
CString GetChildData() const;
CString GetAttrib( LPCTSTR szAttrib ) const;
CString GetChildAttrib( LPCTSTR szAttrib ) const;
// Description of the error after SetDoc has returned false.
CString GetError() const;

When you call SetDoc it parses the szDoc string and populates the CMarkup object. If it fails, it returns false, and you can call GetError for an error description. The IsWellFormed method returns true if the CMarkup object has at least a root element; it does not verify well-formedness.

Using CMarkup

The CMarkup class encapsulates the XML document text, structure, and current positions. It has methods both to add elements and to navigate and get element attributes and data. The locations in the document where operations are performed are governed by the current position and the current child position. This current positioning allows you to work with the XML document without instantiating additional objects that point into the document. At all times, the object maintains a string representing the text of the document which can be retrieved using GetDoc.

Check out the free firstobject XML editor which generates C++ source code for creating and navigating your own XML documents with CMarkup Lite.

Creating an XML Document

To create an XML document, instantiate a CMarkup object and call AddElem to create the root element. At this point, if you called AddElem("ORDER") your document would simply contain the empty ORDER element <ORDER/>. Then call AddChildElem to create elements under the root element (i.e. "inside" the root element, hierarchically speaking). The following example code creates an XML document and retrieves it into a CString:

Collapse

Copy Code

// Build an ORDER document holding one ITEM with SN, NAME and QTY children.
// Literals are wrapped in _T() because the methods take LPCTSTR; plain
// narrow literals do not compile in _UNICODE builds.
CMarkup xml;
xml.AddElem( _T("ORDER") );
xml.AddChildElem( _T("ITEM") );
xml.IntoElem(); // ITEM becomes the main position
xml.AddChildElem( _T("SN"), _T("132487A-J") );
xml.AddChildElem( _T("NAME"), _T("crank casing") );
xml.AddChildElem( _T("QTY"), _T("1") );
CString csXML = xml.GetDoc();

This code generates the following XML. The root is the ORDER element; notice that its start tag <ORDER> is at the beginning and end tag </ORDER> is at the bottom. When an element is under (i.e. inside or contained by) a parent element, the parent's start tag is before it and the parent's end tag is after it. The ORDER element contains one ITEM element. That ITEM element contains 3 child elements: SN, NAME, and QTY.

Collapse

Copy Code

<ORDER>
<ITEM>
<SN>132487A-J</SN>
<NAME>crank casing</NAME>
<QTY>1</QTY>
</ITEM>
</ORDER>

As shown in the example, you can create elements under a child element by calling IntoElem to move your current main position to where the current child position is so you can begin adding under what was the child element. CMarkup maintains a current position in order to keep your source code shorter and simpler. This same position logic is used when navigating a document.

Navigating an XML Document

The XML string created in the above example can be parsed into a CMarkup object with the SetDoc method. You can also navigate it right inside the same CMarkup object where it was created; just call ResetPos if you want to reset the current position back to the beginning of the document.

In the following example, after populating the CMarkup object from the csXML string, we loop through all ITEM elements under the ORDER element and get the serial number and quantity of each item:

Collapse

Copy Code

CMarkup xml;
xml.SetDoc( csXML );
while ( xml.FindChildElem("ITEM") )
{
    xml.IntoElem();
    xml.FindChildElem( "SN" );
    CString csSN = xml.GetChildData();
    xml.FindChildElem( "QTY" );
    int nQty = atoi( xml.GetChildData() );
    xml.OutOfElem();
}

For each item we find, we call IntoElem before interrogating its child elements, and then OutOfElem afterwards. As you get accustomed to this type of navigation you will know to check in your loops to make sure there is a corresponding OutOfElem call for every IntoElem call.

Adding Elements and Attributes

The above example for creating a document only created one ITEM element. Here is an example that creates multiple items loaded from a previously populated data source, plus a SHIPMENT information element in which one of the elements has an attribute. This code also demonstrates that instead of calling AddChildElem, you can call IntoElem and AddElem. It means more calls, but some people find this more intuitive.

Collapse

Copy Code

CMarkup xml;
xml.AddElem( "ORDER" );
xml.IntoElem(); // inside ORDER
for ( int nItem=0; nItem<aItems.GetSize(); ++nItem )
{
    xml.AddElem( "ITEM" );
    xml.IntoElem(); // inside ITEM
    xml.AddElem( "SN", aItems[nItem].csSN );
    xml.AddElem( "NAME", aItems[nItem].csName );
    xml.AddElem( "QTY", aItems[nItem].nQty );
    xml.OutOfElem(); // back out to ITEM level
}
xml.AddElem( "SHIPMENT" );
xml.IntoElem(); // inside SHIPMENT
xml.AddElem( "POC" );
xml.SetAttrib( "type", csPOCType );
xml.IntoElem(); // inside POC
xml.AddElem( "NAME", csPOCName );
xml.AddElem( "TEL", csPOCTel );

This code generates the following XML. The root ORDER element contains 2 ITEM elements and a SHIPMENT element. The ITEM elements both contain SN, NAME and QTY elements. The SHIPMENT element contains a POC element which has a type attribute, and NAME and TEL child elements.

Collapse

Copy Code

<ORDER>
<ITEM>
<SN>132487A-J</SN>
<NAME>crank casing</NAME>
<QTY>1</QTY>
</ITEM>
<ITEM>
<SN>4238764-A</SN>
<NAME>bearing</NAME>
<QTY>15</QTY>
</ITEM>
<SHIPMENT>
<POC type="non-emergency">
<NAME>John Smith</NAME>
<TEL>555-1234</TEL>
</POC>
</SHIPMENT>
</ORDER>

Finding Elements

The FindElem and FindChildElem methods go to the next sibling element. If the optional tag name argument is specified, then they go to the next element with a matching tag name. The element that is found becomes the current element, and the next call to Find will go to the next sibling or matching sibling after that current position.

When you cannot assume the order of the elements, you must reset the position between calls to the Find method. Looking at the ITEM element in the above example, if someone else is creating the XML and you cannot assume the SN element is before the QTY element, then call ResetChildPos() before finding the QTY element.

To find the item with a particular serial number, you can loop through the ITEM elements and compare the SN element data to the serial number you are searching for. This example differs from the original navigation example by calling IntoElem to go into the ORDER element and using FindElem("ITEM") instead of FindChildElem("ITEM"); either way is fine. And notice that by specifying the "ITEM" element tag name in the Find method we ignore all other sibling elements such as the SHIPMENT element.

Collapse

Copy Code

CMarkup xml;
xml.SetDoc( csXML );
xml.FindElem(); // ORDER element is root
xml.IntoElem(); // inside ORDER
while ( xml.FindElem("ITEM") )
{
    xml.FindChildElem( "SN" );
    if ( xml.GetChildData() == csFindSN )
        break; // found
}

Encodings

ASCII refers to the character codes under 128 that we have come to depend on, programming in English. Conveniently, if you are only using ASCII, UTF-8 encoding is the same as your common ASCII set.

If you are using a character set not corresponding to one of the Unicode sets UTF-8, UTF-16 or UCS-2, you really should declare it in your XML declaration for the sake of interoperability and viewing it properly in Internet Explorer. Character sets like ISO-8859-1 (Western European) assign characters to the values in a byte between 128 and 255, so that every character still only uses one byte. Windows double-byte character sets such as GB2312, Shift_JIS and EUC-KR use one or two bytes per character. For these Windows charsets, put _MBCS in your preprocessor definitions and make sure your user's Operating System is set to the corresponding code page.

To prefix your XML document with an XML declaration such as <?xml version="1.0" encoding="ISO-8859-1"?>, pass it to SetDoc or the CMarkup constructor. Include a CRLF at the end as shown so that the root element goes on the next line.

Collapse

Copy Code

xml.SetDoc( "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\r\n" );
xml.AddElem( "island", "Curaçao" );

Depth First Traversal

You can use the following code to loop through every element in your XML document. In the part of the code where you process the element, every element in the document (except the root element) will be encountered in depth first order. For illustrative purposes, it gets the tag name of the element. If you were searching for a particular element tag name you could break out of the loop at this point. "Depth first" means that it traverses all of an element's children before going to its sibling.

Collapse

Copy Code

BOOL bFinished = FALSE;
xml.ResetPos();
if ( ! xml.FindChildElem() )
    bFinished = TRUE;
while ( ! bFinished )
{
    // Process element
    xml.IntoElem();
    CString csTag = xml.GetTagName();

    // Next element (depth first)
    BOOL bFound = xml.FindChildElem();
    while ( ! bFound && ! bFinished )
    {
        if ( xml.OutOfElem() )
            bFound = xml.FindChildElem();
        else
            bFinished = TRUE;
    }
}

Loading and Saving Files

CMarkup Lite does not have Load and Save methods. To load a file, look in the CMarkupDlg::OnButtonParse method which loads a file into a string. Once you have it in a string, you can put it into the CMarkup object using SetDoc. To save it to a file, call GetDoc to get the string and then implement your own code to write the string to your file. When you need to implement any of your own project specific I/O error handling, streaming, permissions/locking, and charset conversion, it is actually good software design to keep this outside of the CMarkup class allowing CMarkup to remain a generic class.

The Test Dialog

The Markup.exe test bed for CMarkup is a Visual Studio 6.0 MFC project (it also compiles in VS .NET). When the dialog starts, it performs diagnostics in the RunTest function to test CMarkup in the context of the particular build options that have been selected. You can step through the RunTest function to see a lot of examples of how to use CMarkup. Use the Open and Parse button in the dialog to test a file.

In the following illustration, the Build Version is shown as "CMarkup Lite 6.5 Debug Unicode." This means that it is the debug version built with _UNICODE defined. The RunTest completed successfully. A parse error was encountered in the order_e.xml file. It also shows the load and parse times, and file size.

The Test Dialog keeps track of the last file parsed and the dialog screen position for convenience. This is kept in the registry under HKEY_CURRENT_USER\Software\First Objective Software\Markup\Settings.

How CMarkup Works

The CMarkup strategy is to leave the data in the document string and maintain a hierarchical arrangement of indexes mapping out the document.

increase speed: parse in one pass and maintain hierarchy of indexes
reduce overhead: do not copy or break up the text of the document

CMarkup parses the 250k play.xml sample document in about 40 milliseconds (1/25th of a second) on a 500 MHz machine, holding it as a single string, and allocating about 200k for a map of the 6343 elements. From then on, navigation does not require any parsing. As a rule of thumb, the map of indexes takes up approximately the same amount of memory as the document, so the memory footprint of the CMarkup object should settle down around 2 times the size of the document. For each element in the document a struct of eight integers (32 bytes) is maintained.

Collapse

Copy Code

int nStartL;
int nStartR;
int nEndL;
int nEndR;
int nReserved;
int iElemParent;
int iElemChild;
int iElemNext;

Look at the start and end tags in <QTY>1</QTY>. The struct contains the offsets of the left and right of both the start and end tags (i.e. all the < and > signs). The reserved integer is not currently used but could be used for a delete flag and/or level (i.e. depth) in the hierarchy to support indentation. The other three integers are indexes to the structs for the parent, child and next elements.

When the document is first parsed an array of these structs is built, and then as elements are modified and inserted in the XML, the structs are modified and added. Rather than allocating structs individually, they are allocated in an array using a "grow-by" mechanism to reduce the number of allocations to a handful. That is why integer array indexes rather than pointers are used for the links. Once an element is assigned an index in the array, that index does not change. So the index can be used as a way of referring to and locating an element.

Release Notes

The public methods of this release 6.5 of CMarkup Lite are backwards compatible with the previous release 6.1 posted here in August 2001, except for one rare usage of IntoElem. In 6.1, if you called IntoElem without a current child element, it would find the first child element. Now in 6.5 when there is no current child position, IntoElem puts the main position before the first child element so that a subsequent call to FindElem will not bypass the first element. So, the quick way to check this when upgrading is to scan all occurrences of IntoElem and make sure the previous CMarkup navigation call is FindChildElem before it. Or, if the child element was just created with AddChildElem then it's okay because that sets the current child position too. For full details on this, see the IntoElem Changes in Release 6.3.

Other major changes since 6.1:

Fix: MBCS double-byte text x_TextToDoc *thanks knight_zhuge
Performance: parsing is roughly twice as fast
Debugging: see m_pMainDS and m_pChildDS class members while debugging to see string pointers showing current main and child positions
New Test Dialog interface with diagnostic results and load vs. parse times, and RunTest code for startup

License

CMarkup Lite is free for compiling into your commercial, personal and educational applications. Modify it as much as you like, but retain the copyright notice in the source code remarks. Redistribution of the modified or unmodified CMarkup Lite class source code is limited to your own development team and it cannot be made publicly available or distributable as part of any source code library or product, even if that offering is free. For source code products that derive from or utilize CMarkup Lite, please refer users to this article to obtain the source files for themselves. You are encouraged to discuss this source code and share enhancements here in the discussion board under this article. Enjoy!

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here

About the Author

Ben Bryant

Member

Raised in Southern Ontario Canada. Bachelor of Science from the University of Toronto in Computer Science and Anthropology. Living near Washington D.C. in Virginia, USA.

Location:

United States