使用mshtml解析html

测试用例

 

测试用例(test.html)
<html>
<head>
<title>
    Just a Test
</title>
</head>
<body>
gaofeng hello!!
<div>
<table bgcolor="red">

<tr>
<td bgcolor="yellow" border="2">Name</td>
<td id="qualify1" border="1" class="blueBorder" bgcolor=blue></td>
</tr>
<tr>
<td><p id="qualify2" class="blueBorder" bgcolor="blue" border="1">Surname</p></td>
<td></td>
</tr>
<tr>
<td>address</td>
<td></td>
</tr>
</table>
</div>
</body>
</html>

 

 

头文件

 

#include <cstdio>
#include <cwchar>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include <comdef.h>
#include <mshtml.h>

#import <mshtml.tlb> no_auto_exclude

 

 

测试代码

 

代码(TestMSHTML.cpp)
//  TestMSHTML.cpp : 定义控制台应用程序的入口点。
//

#include 
" stdafx.h "
#include 
" TestMSHTML.h "
#ifdef _DEBUG
#define  new DEBUG_NEW
#endif


//  唯一的应用程序对象

CWinApp theApp;
FILE 
*  fout;
using   namespace  std;
// OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");

typedef  
int  BorderAttribute;
void  FindAllElementHavingBg(IHTMLDocument2  *  pNewDoc,map < BorderAttribute,IHTMLElement  *>&  borderValue2ElementMap)
{
    IHTMLElement 
*  pBody;
    pNewDoc
-> get_body( & pBody);
    pBody
-> Release();
}

void  PrintTabs( int  n)
{
    
for  ( int  i  =   0 ;i < n;i ++ )
    {
        
// cout << '\t';
        fwprintf(fout,_T( " \t " ));
    }
}

void  VisitNode(IHTMLElement *  pElement, int  level)
{
    BSTR strName,strId,strTag;
    PrintTabs(level);
    pElement
-> get_className( & strName);
    pElement
-> get_id( & strId);
    pElement
-> get_tagName( & strTag);
    
if  (strTag != NULL)
    {
        fwprintf(fout,_T(
" TagName:%s  " ),strTag);
    }
    
if  (strName != NULL)
    {
        fwprintf(fout,_T(
" className:%s  " ),strName);
    }
    
if  (strId  !=  NULL)
    {
        fwprintf(fout,_T(
" Id:%s  " ),strId);
    }
    SysFreeString(strName);
    SysFreeString(strId);
    SysFreeString(strTag);
    BSTR strAttrName1 
=  _T( " border " );
    BSTR strAttrName2 
=  _T( " bgcolor " );
    VARIANT val;

    pElement
-> getAttribute(strAttrName1, 2 , & val);
    
if  (val.vt  !=  VT_NULL)
    {
        
if  (val.bstrVal  !=  NULL)
        {
            fwprintf(fout,_T(
" border:%s  " ),val.bstrVal);
        }
    }


    pElement
-> getAttribute(strAttrName2, 2 , & val);
    
if  (val.vt  !=  VT_NULL)
    {
        
if  (val.bstrVal  !=  NULL)
        {
            fwprintf(fout,_T(
" bgcolor:%s  " ),val.bstrVal);
        }
    }

    
    fwprintf(fout,_T(
" \n " ));
}
// 将DOM树打印出来
void  Run(IHTMLElement  *  pElement, int  level)
{
    IHTMLElementCollection 
*  children;

    VisitNode(pElement,level);


    IDispatch
*  pDisp;
    pElement
-> get_children( & pDisp);
    pDisp
-> QueryInterface(IID_IHTMLElementCollection,( void ** ) & children);
    pDisp
-> Release();

    
long  len;
    children
-> get_length( & len);
    VARIANT dummy;
    dummy.vt 
=  VT_I4;
    
for  ( int  i  =   0 ;i  <  len;i ++ )
    {
        IHTMLElement
*  child;
        dummy.intVal 
=  i;
        children
-> item(dummy,dummy,(IDispatch ** ) & pDisp);
        pDisp
-> QueryInterface(IID_IHTMLElement,( void ** ) & child);
        pDisp
-> Release();
        Run(child,level 
+   1 );
        child
-> Release();
    }
    children
-> Release();
}
void  TestParse(IHTMLDocument2  *  pNewDoc)
{
    BSTR strText;
    IHTMLElement 
* pBody;
    pNewDoc
-> get_body( & pBody);
    pBody
-> get_innerText( & strText);
    wprintf(_T(
" %s\n " ),strText);
    SysFreeString(strText);
    

    pNewDoc
-> get_title( & strText);
    wprintf(_T(
" %s\n " ),strText);
    SysFreeString(strText);
    
    cout 
<<   " Run begin.... " << endl;
    Run(pBody,
0 );
    cout 
<<   " Run end.... " << endl;

    pBody
-> Release();

    
// FindAllElementHavingBg(pNewDoc);

}
void  TestMSHTML(wchar_t  *  wcontent)
{
    IHTMLDocument2 
* pDoc  =  NULL;
    CoInitialize(NULL);
    CoCreateInstance(CLSID_HTMLDocument, 
                     NULL, 
                     CLSCTX_INPROC_SERVER, 
                     IID_IHTMLDocument2, 
                    (LPVOID 
* & pDoc);

    
if  (pDoc)
    {
        IPersistStreamInit 
* pPersist  =  NULL;
        pDoc
-> QueryInterface(IID_IPersistStreamInit, 
                             (LPVOID 
* & pPersist);
        
if  (pPersist)
        {
            IMarkupServices 
* pMS  =  NULL;
            pPersist
-> InitNew();
            pPersist
-> Release();
            pDoc
-> QueryInterface(IID_IMarkupServices, 
                                (LPVOID 
* & pMS);

            
if  (pMS)
            {
                IMarkupContainer 
* pMC  =  NULL;
                IMarkupPointer 
* pMkStart  =  NULL;
                IMarkupPointer 
* pMkFinish  =  NULL;
                pMS
-> CreateMarkupPointer( & pMkStart);
                pMS
-> CreateMarkupPointer( & pMkFinish);
                pMS
-> ParseString(wcontent,
                    
0
                    
& pMC, 
                    pMkStart, 
                    pMkFinish);

                
if  (pMC)
                {
                    IHTMLDocument2 
* pNewDoc  =  NULL;

                    pMC
-> QueryInterface(IID_IHTMLDocument, 
                        (LPVOID 
* & pNewDoc);

                    
if  (pNewDoc)
                    {
                        
//  do anything with pNewDoc, in this case 
                        
//  get the body innerText.
                        TestParse(pNewDoc);
    
                        pNewDoc
-> Release();
                    }

                    pMC
-> Release();
                }

                
if  (pMkStart)
                    pMkStart
-> Release();

                
if  (pMkFinish)
                    pMkFinish
-> Release();

                pMS
-> Release();
            }
        }

        pDoc
-> Release();
    }

    CoUninitialize();

}

inline wchar_t
*  AnsiToUnicode(  const   char *  szStr )
{
    
int  nLen  =  MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr,  - 1 , NULL,  0  );
    
if  (nLen  ==   0 )
    {
        
return  NULL;
    }
    wchar_t
*  pResult  =   new  wchar_t[nLen + 1 ];
    MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, 
- 1 , pResult, nLen );
    pResult[nLen] 
=  L ' \0 ' ;
    
return  pResult;
}

// 调用者负责delete wcontent
wchar_t  *  ReadFromHtmlFile( string  str, string   &  content)
{
    ifstream fin(str.c_str());
    
string  line;
    
while (getline(fin,line))
    {
    
//     cout << line << endl;
        content  =  content  +  line;
    }
    
// cout << content << endl;
    
// cout << content.size() << endl;
    
// printf("original html code\n%s\n",content.c_str());
    wchar_t  *  wcontent  =  AnsiToUnicode(content.c_str()); 
    
// wprintf(L"after transferred\n%s\n",wcontent);
    
// delete[] wcontent;
    fin.close();
    fin.clear();
    
return  wcontent;
}

int  _tmain( int  argc, TCHAR *  argv[], TCHAR *  envp[])
{
    
int  nRetCode  =   0 ;

    
//  初始化 MFC 并在失败时显示错误
     if  ( ! AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(),  0 ))
    {
        
//  TODO: 更改错误代码以符合您的需要
        _tprintf(_T( " 错误: MFC 初始化失败\n " ));
        nRetCode 
=   1 ;
    }
    
else
    {
        fout 
=  fopen( " out.txt " , " w " );
        
string  str  =   " test.html " ;
        
string  content;
        wchar_t 
*  wcontent  =  ReadFromHtmlFile(str,content);
        
int  len  =  wcslen(wcontent);
        
// cout << len << endl;
        
        TestMSHTML(wcontent);
        delete[] wcontent;
        fclose(fout);
    }
    
    
return  nRetCode;
}
输出结果
TagName:BODY
 TagName:DIV
  TagName:TABLE bgcolor:#ff0000
   TagName:TBODY
    TagName:TR
     TagName:TD border:2 bgcolor:#ffff00
     TagName:TD className:blueBorder Id:qualify1 border:1 bgcolor:#0000ff
    TagName:TR
     TagName:TD
      TagName:P className:blueBorder Id:qualify2 border:1 bgcolor:blue
     TagName:TD
    TagName:TR
     TagName:TD
     TagName:TD

 

 

转载于:https://www.cnblogs.com/speedmancs/archive/2010/08/11/1797442.html

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值