这一段转自:http://www.cnblogs.com/speedmancs/archive/2010/08/11/1797442.html
测试用例
<html>
<head>
<title>
Just a Test
</title>
</head>
<body>
gaofeng hello!!
<div>
<table bgcolor="red">
<tr>
<td bgcolor="yellow" border="2">Name</td>
<td id="qualify1" border="1" class="blueBorder" bgcolor=blue></td>
</tr>
<tr>
<td><p id="qualify2" class="blueBorder" bgcolor="blue" border="1">Surname</p></td>
<td></td>
</tr>
<tr>
<td>address</td>
<td></td>
</tr>
</table>
</div>
</body>
</html>
头文件:
#include <iostream>
#include <comdef.h>
#include <mshtml.h>
#include <string>
#include <fstream>
#include <vector>
#include <map>
#import <mshtml.tlb> no_auto_exclude
代码:
// TestMSHTML.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "TestMSHTML.h"
#ifdef _DEBUG
#define new DEBUG_NEW
#endif
// The one and only application object
CWinApp theApp;
// Output file shared by the printing functions (PrintTabs, VisitNode);
// it is opened and closed in _tmain.
FILE * fout;
using namespace std;
//OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");
// Key type of the map taken by FindAllElementHavingBg.
typedef int BorderAttribute;
// Placeholder for collecting elements keyed by their border attribute.
// At present it only fetches the document body and releases it again;
// borderValue2ElementMap is left untouched.
//
// pNewDoc                 document to inspect; NULL is tolerated and ignored.
// borderValue2ElementMap  reserved for the result (currently unused).
void FindAllElementHavingBg(IHTMLDocument2 * pNewDoc,map<BorderAttribute,IHTMLElement *>& borderValue2ElementMap)
{
    if (pNewDoc == NULL)
    {
        return;
    }
    IHTMLElement * pBody = NULL;
    // Only release a pointer that get_body really handed out; on failure
    // (or a document without a body) pBody stays NULL.
    if (SUCCEEDED(pNewDoc->get_body(&pBody)) && pBody != NULL)
    {
        pBody->Release();
    }
}
// Writes n tab characters to the global output file fout.
// Used to indent one line of the tree dump; n <= 0 writes nothing.
void PrintTabs(int n)
{
    int remaining = n;
    while (remaining > 0)
    {
        fwprintf(fout,_T("\t"));
        --remaining;
    }
}
void VisitNode(IHTMLElement* pElement,int level)
{
BSTR strName,strId,strTag;
PrintTabs(level);
pElement->get_className(&strName);
pElement->get_id(&strId);
pElement->get_tagName(&strTag);
if (strTag!=NULL)
{
fwprintf(fout,_T("TagName:%s "),strTag);
}
if (strName!=NULL)
{
fwprintf(fout,_T("className:%s "),strName);
}
if (strId != NULL)
{
fwprintf(fout,_T("Id:%s "),strId);
}
SysFreeString(strName);
SysFreeString(strId);
SysFreeString(strTag);
BSTR strAttrName1 = _T("border");
BSTR strAttrName2 = _T("bgcolor");
VARIANT val;
pElement->getAttribute(strAttrName1,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("border:%s "),val.bstrVal);
}
}
pElement->getAttribute(strAttrName2,2,&val);
if (val.vt != VT_NULL)
{
if (val.bstrVal != NULL)
{
fwprintf(fout,_T("bgcolor:%s "),val.bstrVal);
}
}
fwprintf(fout,_T("\n"));
}
// Prints the DOM subtree rooted at pElement: the element itself via VisitNode,
// then every child element recursively with level + 1.
//
// pElement  root of the subtree; NULL is ignored.
// level     depth of pElement, forwarded to VisitNode for indentation.
void Run(IHTMLElement * pElement,int level)
{
    if (pElement == NULL)
    {
        return;
    }
    VisitNode(pElement,level);

    IDispatch* pDisp = NULL;
    if (FAILED(pElement->get_children(&pDisp)) || pDisp == NULL)
    {
        return;
    }
    IHTMLElementCollection * children = NULL;
    HRESULT hr = pDisp->QueryInterface(IID_IHTMLElementCollection,(void**)&children);
    pDisp->Release();
    if (FAILED(hr) || children == NULL)
    {
        return;
    }

    long len = 0;
    children->get_length(&len);
    for (long i = 0;i < len;i++)
    {
        // The same index variant is passed for both "name" and "index" arguments.
        VARIANT index;
        VariantInit(&index);
        index.vt = VT_I4;
        index.lVal = i;

        IDispatch* pChildDisp = NULL;
        if (FAILED(children->item(index,index,&pChildDisp)) || pChildDisp == NULL)
        {
            continue;
        }
        IHTMLElement* child = NULL;
        hr = pChildDisp->QueryInterface(IID_IHTMLElement,(void**)&child);
        pChildDisp->Release();
        if (SUCCEEDED(hr) && child != NULL)
        {
            Run(child,level + 1);
            child->Release();
        }
    }
    children->Release();
}
// Exercises a parsed document: prints the body's inner text and the document
// title to stdout, then dumps the element tree below the body through Run
// (which writes to fout).
//
// pNewDoc  parsed document; NULL is ignored.
void TestParse(IHTMLDocument2 * pNewDoc)
{
    if (pNewDoc == NULL)
    {
        return;
    }
    BSTR strText = NULL;
    IHTMLElement *pBody = NULL;
    pNewDoc->get_body(&pBody);
    if (pBody != NULL)
    {
        pBody->get_innerText(&strText);
        // A NULL BSTR stands for an empty string; never hand NULL to %s.
        wprintf(L"%s\n",strText != NULL ? strText : L"");
        SysFreeString(strText);
        strText = NULL;
    }
    pNewDoc->get_title(&strText);
    wprintf(L"%s\n",strText != NULL ? strText : L"");
    SysFreeString(strText);
    cout << "Run begin...."<<endl;
    if (pBody != NULL)
    {
        Run(pBody,0);
    }
    cout << "Run end...."<<endl;
    if (pBody != NULL)
    {
        pBody->Release();
    }
    //FindAllElementHavingBg(pNewDoc);
}
void TestMSHTML(wchar_t * wcontent)
{
IHTMLDocument2 *pDoc = NULL;
CoInitialize(NULL);
CoCreateInstance(CLSID_HTMLDocument,
NULL,
CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2,
(LPVOID *) &pDoc);
if (pDoc)
{
IPersistStreamInit *pPersist = NULL;
pDoc->QueryInterface(IID_IPersistStreamInit,
(LPVOID *) &pPersist);
if (pPersist)
{
IMarkupServices *pMS = NULL;
pPersist->InitNew();
pPersist->Release();
pDoc->QueryInterface(IID_IMarkupServices,
(LPVOID *) &pMS);
if (pMS)
{
IMarkupContainer *pMC = NULL;
IMarkupPointer *pMkStart = NULL;
IMarkupPointer *pMkFinish = NULL;
pMS->CreateMarkupPointer(&pMkStart);
pMS->CreateMarkupPointer(&pMkFinish);
pMS->ParseString(wcontent,
0,
&pMC,
pMkStart,
pMkFinish);
if (pMC)
{
IHTMLDocument2 *pNewDoc = NULL;
pMC->QueryInterface(IID_IHTMLDocument,
(LPVOID *) &pNewDoc);
if (pNewDoc)
{
// do anything with pNewDoc, in this case
// get the body innerText.
TestParse(pNewDoc);
pNewDoc->Release();
}
pMC->Release();
}
if (pMkStart)
pMkStart->Release();
if (pMkFinish)
pMkFinish->Release();
pMS->Release();
}
}
pDoc->Release();
}
CoUninitialize();
}
// Converts a NUL-terminated string in the system ANSI code page (CP_ACP)
// into a newly allocated wide-character string.
//
// Returns NULL when the size query reports 0; otherwise returns a buffer
// allocated with new[] that the caller must release with delete[].
inline wchar_t* AnsiToUnicode( const char* szStr )
{
    // First pass with no output buffer: ask how many wide characters are needed.
    const int cchNeeded = MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, NULL, 0 );
    if (cchNeeded != 0)
    {
        // One extra slot so the buffer can always be terminated explicitly.
        wchar_t* wideText = new wchar_t[cchNeeded+1];
        // Second pass: perform the actual conversion into the buffer.
        MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, wideText, cchNeeded );
        wideText[cchNeeded] = L'\0';
        return wideText;
    }
    return NULL;
}
// Reads the text file named by str, appends its contents to content and
// returns the accumulated content converted to a wide-character string.
//
// str      path of the file to read.
// content  receives the file text (appended to whatever it already holds).
//
// Returns the buffer produced by AnsiToUnicode (NULL if conversion fails).
// The caller is responsible for releasing it with delete[].
wchar_t * ReadFromHtmlFile(string str,string & content)
{
    ifstream fin(str.c_str());
    string line;
    while(getline(fin,line))
    {
        // getline strips the line break; put one back so that tokens on
        // adjacent lines of the HTML source are not glued together.
        content += line;
        content += '\n';
    }
    fin.close();
    return AnsiToUnicode(content.c_str());
}
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
int nRetCode = 0;
// 初始化 MFC 并在失败时显示错误
if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
{
// TODO: 更改错误代码以符合您的需要
_tprintf(_T("错误: MFC 初始化失败\n"));
nRetCode = 1;
}
else
{
fout = fopen("out.txt","w");
string str = "test.html";
string content;
wchar_t * wcontent = ReadFromHtmlFile(str,content);
int len = wcslen(wcontent);
//cout << len << endl;
TestMSHTML(wcontent);
delete[] wcontent;
fclose(fout);
}
return nRetCode;
}
输出结果:
TagName:BODY
TagName:DIV
TagName:TABLE bgcolor:#ff0000
TagName:TBODY
TagName:TR
TagName:TD border:2 bgcolor:#ffff00
TagName:TD className:blueBorder Id:qualify1 border:1 bgcolor:#0000ff
TagName:TR
TagName:TD
TagName:P className:blueBorder Id:qualify2 border:1 bgcolor:blue
TagName:TD
TagName:TR
TagName:TD
TagName:TD
我的电脑使用 WIN7。根据以上代码我修改出了自己的代码,但在 VC6 中编译时提示 IID_IMarkupServices 未定义。于是打开 VC6 的头文件 C:\Program Files\Microsoft Visual Studio\VC98\Include\MSHTML.H,与 VS2008 的同名头文件进行对比,发现 VC6 的头文件中确实没有定义 IID_IMarkupServices。再打开 OLE/COM Object Viewer(Microsoft Visual Studio 6.0 -> Microsoft Visual Studio 6.0 Tools -> OLE Tools),在 Type Libraries 下找到 Microsoft HTML Object Library (Ver 4.0),搜索 IMarkupServices2,找到如下内容:
[
odl,
uuid(3050F682-98B5-11CF-BB82-00AA00BDCE0B)
]
interface IMarkupServices2 : IMarkupServices {
HRESULT _stdcall ParseGlobalEx(
[in] wireHGLOBAL hglobalHTML,
[in] unsigned long dwFlags,
[in] IMarkupContainer* pContext,
[out] IMarkupContainer** ppContainerResult,
[in] IMarkupPointer* pPointerStart,
[in] IMarkupPointer* pPointerFinish);
HRESULT _stdcall ValidateElements(
[in] IMarkupPointer* pPointerStart,
[in] IMarkupPointer* pPointerFinish,
[in] IMarkupPointer* pPointerTarget,
[in, out] IMarkupPointer* pPointerStatus,
[out] IHTMLElement** ppElemFailBottom,
[out] IHTMLElement** ppElemFailTop);
HRESULT _stdcall SaveSegmentsToClipboard(
[in] ISegmentList* pSegmentList,
[in] unsigned long dwFlags);
};
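说明在 WIN7 下 IMarkupServices2 的接口 ID(IID)是 3050F682-98B5-11CF-BB82-00AA00BDCE0B。注意:这是 IMarkupServices2 的 IID,并不是 IMarkupServices 自身的 IID(后者应为 3050F4A0-98B5-11CF-BB82-00AA00BDCE0B);由于 IMarkupServices2 继承自 IMarkupServices,用它查询得到的接口指针仍然可以当作 IMarkupServices 使用。
所以在我的文件里面添加了如下代码: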
extern "C" const GUID __declspec(selectany) IID_IMarkupServices =
{0x3050F682,0x98B5,0x11CF,{0xBB,0x82,0x00,0xAA,0x00,0xBD,0xCE,0x0B}};
再次编译就通过了。
解析的时候要注意:必须先把读入的网页内容转换为 wchar_t(宽字符)字符串,再传给 ParseString。