#ifndef _BaseHtmlParser_H_
#define _BaseHtmlParser_H_
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <string>
#include <utility>
#include <vector>
using namespace std;
//! Base class for a small event-driven HTML scanner.
//!
//! Parse() walks an HTML document once and reports what it finds through the
//! virtual On*() callbacks below. Derive from this class and override the
//! callbacks you are interested in; the default implementations do nothing.
class CBaseHtmlParser
{
public:
    //! Non-owning string slice: a pointer plus a byte count.
    //! Slices handed to the callbacks point into the caller's buffer and are
    //! not necessarily NUL-terminated, so always honour cbData.
    struct SZ_STRING
    {
        const char* pbData; //!< first byte of the slice
        size_t cbData;      //!< length of the slice in bytes
    };

public:
    CBaseHtmlParser();
    virtual ~CBaseHtmlParser();

    // Init or Destroy, do nothing here now
    virtual void Initialize() {}
    virtual void Destroy() {}

    // Two interfaces to parse an html page.
    // URL is the address the page was fetched from; it becomes the base used
    // by Relativity2AbsoluteURL(). Content is the raw page text.
    virtual bool Parse(const std::string& URL, const std::string& Content);
    virtual bool Parse(const SZ_STRING& strUrl, const SZ_STRING& strContent);

    // Util api to get an absolute url based on the current page.
    // URL is rewritten in place.
    void Relativity2AbsoluteURL(std::string& URL);

    // Event when a tag begins, such as '<a href=..>': strTagName is 'a' and
    // Attribs contains the (name, value) pair for 'href'.
    // Attribs is taken by value, so overriders may modify their copy freely.
    virtual void OnStartTag(const SZ_STRING& strTagName,
                            std::vector< std::pair<SZ_STRING, SZ_STRING> > Attribs) {}

    // Event when a tag closes, such as '</a>': strTagName is 'a'.
    virtual void OnEndTag(const SZ_STRING& strTagName) {}

    // Event for text between tags, such as '<b>hello</b>': strData is 'hello'.
    virtual void OnData(const SZ_STRING& strData) {}

    // Event for the body of a script element or of a comment, such as
    // '<script>..</script>' or '<!--..-->'.
    virtual void OnComment(const SZ_STRING& strComment) {}

private:
    char m_szBaseURL[1024];    // directory part of the page URL, filled in by Parse()
    char m_szBaseDomain[1024]; // scheme + host part of the page URL, filled in by Parse()
};
#endif
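// ---------------------------------------------------------------------------
// Cross-platform C++ HTML parser: basehtmlparser.cpp
// (2005-05-26 11:40:49)
// ---------------------------------------------------------------------------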
#include "basehtmlparser.h"
#ifndef WIN32
#define strnicmp strncasecmp
#endif
// Starts with empty base strings so that Relativity2AbsoluteURL() reads
// well-defined (empty) buffers even if it is called before the first Parse().
CBaseHtmlParser::CBaseHtmlParser()
{
    m_szBaseURL[0]='\0';
    m_szBaseDomain[0]='\0';
}

// Nothing to release: the class owns no dynamic resources.
CBaseHtmlParser::~CBaseHtmlParser()
{
}
void
CBaseHtmlParser::Relativity2AbsoluteURL(string&
URL)
{
int pos=-1;
pos=URL.rfind("#");
if(pos!=string::npos)
{
URL=URL.substr(0,pos);
}
do
{
pos=URL.find("&");
if(pos!=string::npos)
URL=URL.substr(0,pos)+"&"+URL.substr(pos+5);
}while(pos>=0);
do
{
pos=URL.find(">");
if(pos!=string::npos)
URL=URL.substr(0,pos)+">"+URL.substr(pos+4);
}while(pos>=0);
do
{
pos=URL.find("<");
if(pos!=string::npos)
URL=URL.substr(0,pos)+"
}while(pos>=0);
if((URL.length()>=1)&&(URL[0]=='/'))
{
URL=m_szBaseDomain+URL;
}else
if((URL.length()>=7)&&(strnicmp((char*)URL.c_str(),"http://",7)==0))
{
return;
}else
{
URL=m_szBaseURL+URL;
}
}
bool
CBaseHtmlParser::Parse(const
string& URL,const string&
Content)
{
SZ_STRING strUrl,strContent;
strUrl.pbData=URL.c_str();
strUrl.cbData=URL.length();
strContent.pbData=Content.c_str();
strContent.cbData=Content.length();
return Parse(strUrl,strContent);
}
bool
CBaseHtmlParser::Parse(const
SZ_STRING &strUrl,const SZ_STRING
&strContent)
{
sprintf(m_szBaseURL,"%.*s",strUrl.cbData,strUrl.pbData);
char* pend=strrchr(m_szBaseURL,'/');
if(pend!=NULL)
{
pend++;
*pend='\0';
}
sprintf(m_szBaseDomain,"%.*s",strUrl.cbData,strUrl.pbData);
pend=strchr(m_szBaseDomain+strlen("http://")+1,'/');
if(pend!=NULL)
{
*pend='\0';
}
size_t i;
size_t nContent=strContent.cbData;
const char* pContent=(const char*)(strContent.pbData);
for(i=0;i
{
if(pContent[i]=='
{
if(nContent>4)
if((i
start
{
i+=4;
size_t nCommentStart=i;
if(nContent>3)
while(i
{
if((pContent[i]=='-')&&(pContent[i+1]=='-')&&(pContent[i+2]=='>'))//comment
end
{
SZ_STRING strComment;
strComment.pbData=pContent+nCommentStart;
strComment.cbData=i-nCommentStart;
OnComment(strComment);
i+=3;
break;
}
i++;
}
continue;
}
//tag here
size_t nTagNameStart=i+1;
while((nTagNameStart
'))
nTagNameStart++;
size_t nTagNameEnd=nTagNameStart+1;
while((nTagNameEnd
')&&(pContent[nTagNameEnd]!='>'))
nTagNameEnd++;
SZ_STRING strTagName;
strTagName.pbData = pContent+nTagNameStart;
strTagName.cbData =
nTagNameEnd-nTagNameStart;
size_t nTagEnd=nTagNameEnd;
while((nTagEnd'))
nTagEnd++;
nTagEnd++;
const char* pTag=pContent+i;
size_t nTag=nTagEnd-i;
i=nTagEnd;
vector<
pair
> Attribs;
if((strTagName.cbData
==6)&&(strnicmp((char*)strTagName.pbData,"script",6)==0))//
{
size_t nScriptStart=i;
if(nContent>8)
while(i
{
if(strnicmp((char*)pContent+i,"
{
OnStartTag(strTagName,Attribs);
SZ_STRING strComment;
strComment.pbData =
pContent+nScriptStart;
strComment.cbData =
i-nScriptStart;
OnComment(strComment);
OnEndTag(strTagName);
i+=8;
break;
}
i++;
}
while((i'))
i++;
i++;
continue;
}
if(strTagName.pbData[0]=='/')
{
strTagName.pbData +=1;
strTagName.cbData -=1;
OnEndTag(strTagName);
continue;
}
size_t m=strTagName.cbData+1;
do
{
while((m
')||((char)pTag[m]=='\r')||((char)pTag[m]=='\n')||((char)pTag[m]=='\t')))
m++;
size_t nAttribStart=m;
while((m
m++;
size_t nAttribStop=m-1;
while((nAttribStop>0)&&((pTag[nAttribStop]=='
')||(pTag[nAttribStop]=='\r')||(pTag[nAttribStop]=='\n')||(pTag[nAttribStop]=='\t')))
nAttribStop--;
if(nAttribStop>0)
nAttribStop++;
SZ_STRING strAttribName;
strAttribName.pbData
=pTag+nAttribStart;
strAttribName.cbData
=nAttribStop-nAttribStart;
m++;
bool bStartWithDQ=false;
bool bStartWithQ=false;
bool bStartWithSpace=false;
if((m
{
bStartWithDQ=true;
m++;
}else
if((m
{
bStartWithQ=true;
m++;
}else
bStartWithSpace=true;
if(m>=nTag)
continue;
size_t nValueStart=m;
while((m
&&((char)pTag[m]!='>')
&&((char)pTag[m]!='\r')
&&((char)pTag[m]!='\n')
&&(!bStartWithDQ||((char)pTag[m]!='\"'))
&&(!bStartWithQ||((char)pTag[m]!='\''))
&&(!bStartWithSpace||((char)pTag[m]!='
')))
m++;
SZ_STRING strAttribValue;
strAttribValue.pbData
=pTag+nValueStart;
strAttribValue.cbData =m-nValueStart;
if(strAttribName.cbData !=0)
Attribs.push_back(pair(strAttribName,strAttribValue));
while((m
'))
m++;
}while(m
OnStartTag(strTagName,Attribs);
}else
{
while((i
')||(pContent[i]=='\r')||(pContent[i]=='\n')||(pContent[i]=='\t')))
i++;
size_t nTextBegin=i;
while((i
i++;
size_t nTextEnd=i;
while((nTextEnd>=nTextBegin)&&((pContent[nTextEnd-1]=='
')||(pContent[nTextEnd-1]=='\r')||(pContent[nTextEnd-1]=='\n')||(pContent[nTextEnd-1]=='\t')))
nTextEnd--;
if(nTextEnd<=nTextBegin)
continue;
SZ_STRING strData;
strData.pbData =pContent+nTextBegin;
strData.cbData =nTextEnd-nTextBegin;
OnData(strData);
}
}
return true;
}
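// ---------------------------------------------------------------------------
// Usage example for the HTML parser: a URL parser
// (2005-05-26 11:42:21)
// After parsing, all URLs are stored in the m_URL2Text table.
// ---------------------------------------------------------------------------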
#ifndef _URL_PARSER_H_
#define _URL_PARSER_H_
#include "basehtmlparser.h"
#include
#include
using namespace std;
class CURLParser: public CBaseHtmlParser
{
public:
CURLParser(){m_bInTagA=false;}
bool Parse(const string& URL,const
string& Content);
public: map
m_URL2Text;
private:
bool m_bInTagA;
string m_strCurURL;
void OnStartTag(const SZ_STRING &
strTagName,vector<
pair
> Attribs);
void OnEndTag(const SZ_STRING &
strTagName);
void OnData(const SZ_STRING & strData);
};
#endif
#include "urlparser.h"
#ifndef WIN32
#define strnicmp strncasecmp
#endif
bool CURLParser::Parse(const
string& URL,const string&
Content)
{
m_URL2Text.clear();
return
CBaseHtmlParser::Parse(URL,Content);
}
void
CURLParser::OnStartTag(const
SZ_STRING & strTagName,vector<
pair
> Attribs)
{
if((strTagName.cbData==1)&&(strnicmp(strTagName.pbData,"A",strTagName.cbData)==0))
{
m_bInTagA=true;
m_strCurURL.clear();
for(size_t
i=0;i
{
SZ_STRING x=Attribs[i].first;
if((Attribs[i].first.cbData==4)&&(strnicmp(Attribs[i].first.pbData,"href",Attribs[i].first.cbData)==0))
{
m_strCurURL=string(Attribs[i].second.pbData,Attribs[i].second.cbData);
Relativity2AbsoluteURL(m_strCurURL);
break;
}
}
}
}
// End-tag callback: leaving an anchor, or a table cell (which also ends any
// anchor that was left open), switches link-text collection off.
void CURLParser::OnEndTag(const SZ_STRING & strTagName)
{
    const size_t nName=strTagName.cbData;
    const char* const pName=strTagName.pbData;

    const bool bClosesAnchor=(nName==1)&&(strnicmp(pName,"A",nName)==0);
    const bool bClosesCell=(nName==2)&&(strnicmp(pName,"td",nName)==0);

    if(bClosesAnchor||bClosesCell)
        m_bInTagA=false;
}
// Text callback: while inside an anchor that has a link target, the text is
// appended to the entry stored for that target in m_URL2Text.
void CURLParser::OnData(const SZ_STRING & strData)
{
    if(!m_bInTagA||m_strCurURL.empty())
        return;

    m_URL2Text[m_strCurURL].append(strData.pbData,strData.cbData);
}