html parser c++,跨平台C++ Html Parser, basehtmlparser.h

#ifndef _BaseHtmlParser_H_

#define _BaseHtmlParser_H_

#include <cstdio>
#include <cstring>
#include <map>
#include <string>
#include <utility>
#include <vector>

using namespace std;

/**
 * Event-driven HTML scanner.
 *
 * Parse() walks a page once and reports what it finds through the virtual
 * On...() callbacks; subclasses override the callbacks they care about.
 * The strings handed to the callbacks point into the content buffer given to
 * Parse(); nothing is copied.
 */
class CBaseHtmlParser
{
public:
    //! A string described by a pointer and a byte count.
    struct SZ_STRING
    {
        const char* pbData;   //!< first byte of the string
        size_t      cbData;   //!< number of bytes at pbData
    };

public:
    CBaseHtmlParser();
    virtual ~CBaseHtmlParser();

    // Init or Destroy, do nothing here now.
    virtual void Initialize() {}
    virtual void Destroy() {}

    // Two interfaces to parse an html page. The std::string overload wraps
    // its arguments in SZ_STRINGs and forwards to the SZ_STRING overload.
    virtual bool Parse(const std::string& URL, const std::string& Content);
    virtual bool Parse(const SZ_STRING& strUrl, const SZ_STRING& strContent);

    // Utility API: rewrites URL in place into an absolute URL based on the
    // current page, i.e. on the base URL / base domain recorded by Parse().
    void Relativity2AbsoluteURL(std::string& URL);

    // Event when a tag begins, such as '<a href=..>': strTagName is 'a' and
    // Attribs holds the (name, value) pairs of the tag, here 'href'.
    virtual void OnStartTag(const SZ_STRING& strTagName,
                            std::vector< std::pair<SZ_STRING, SZ_STRING> > Attribs) {}

    // Event when a tag closes, such as '</a>': strTagName is 'a'.
    virtual void OnEndTag(const SZ_STRING& strTagName) {}

    // Event for text between tags, such as '<a>hello</a>': strData is 'hello'.
    virtual void OnData(const SZ_STRING& strData) {}

    // Event for a script body or a comment, such as '<script>..</script>'
    // or '<!-- .. -->'.
    virtual void OnComment(const SZ_STRING& strComment) {}

private:
    char m_szBaseURL[1024];      // written by Parse(), read by Relativity2AbsoluteURL()
    char m_szBaseDomain[1024];   // written by Parse(), read by Relativity2AbsoluteURL()
};

#endif

跨平台C++ Html Parser, basehtmlparser.cpp

(2005-05-26 11:40:49)

#include "basehtmlparser.h"

#ifndef WIN32

#define strnicmp strncasecmp

#endif

// Builds a parser that has no page context yet. Both base buffers start out
// as empty strings so that Relativity2AbsoluteURL() is well defined even when
// it is called before the first Parse().
CBaseHtmlParser::CBaseHtmlParser()
{
    m_szBaseURL[0] = '\0';
    m_szBaseDomain[0] = '\0';
}

// The destructor body is empty; it is defined out of line for the virtual
// declaration in the class.
CBaseHtmlParser::~CBaseHtmlParser() {}

void

CBaseHtmlParser::Relativity2AbsoluteURL(string&

URL)

{

int pos=-1;

pos=URL.rfind("#");

if(pos!=string::npos)

{

URL=URL.substr(0,pos);

}

do

{

pos=URL.find("&");

if(pos!=string::npos)

URL=URL.substr(0,pos)+"&"+URL.substr(pos+5);

}while(pos>=0);

do

{

pos=URL.find(">");

if(pos!=string::npos)

URL=URL.substr(0,pos)+">"+URL.substr(pos+4);

}while(pos>=0);

do

{

pos=URL.find("<");

if(pos!=string::npos)

URL=URL.substr(0,pos)+"

}while(pos>=0);

if((URL.length()>=1)&&(URL[0]=='/'))

{

URL=m_szBaseDomain+URL;

}else

if((URL.length()>=7)&&(strnicmp((char*)URL.c_str(),"http://",7)==0))

{

return;

}else

{

URL=m_szBaseURL+URL;

}

}

bool

CBaseHtmlParser::Parse(const

string& URL,const string&

Content)

{

SZ_STRING strUrl,strContent;

strUrl.pbData=URL.c_str();

strUrl.cbData=URL.length();

strContent.pbData=Content.c_str();

strContent.cbData=Content.length();

return Parse(strUrl,strContent);

}

bool

CBaseHtmlParser::Parse(const

SZ_STRING &strUrl,const SZ_STRING

&strContent)

{

sprintf(m_szBaseURL,"%.*s",strUrl.cbData,strUrl.pbData);

char* pend=strrchr(m_szBaseURL,'/');

if(pend!=NULL)

{

pend++;

*pend='\0';

}

sprintf(m_szBaseDomain,"%.*s",strUrl.cbData,strUrl.pbData);

pend=strchr(m_szBaseDomain+strlen("http://")+1,'/');

if(pend!=NULL)

{

*pend='\0';

}

size_t i;

size_t nContent=strContent.cbData;

const char* pContent=(const char*)(strContent.pbData);

for(i=0;i

{

if(pContent[i]=='

{

if(nContent>4)

if((i

start

{

i+=4;

size_t nCommentStart=i;

if(nContent>3)

while(i

{

if((pContent[i]=='-')&&(pContent[i+1]=='-')&&(pContent[i+2]=='>'))//comment

end

{

SZ_STRING strComment;

strComment.pbData=pContent+nCommentStart;

strComment.cbData=i-nCommentStart;

OnComment(strComment);

i+=3;

break;

}

i++;

}

continue;

}

//tag here

size_t nTagNameStart=i+1;

while((nTagNameStart

'))

nTagNameStart++;

size_t nTagNameEnd=nTagNameStart+1;

while((nTagNameEnd

')&&(pContent[nTagNameEnd]!='>'))

nTagNameEnd++;

SZ_STRING strTagName;

strTagName.pbData = pContent+nTagNameStart;

strTagName.cbData =

nTagNameEnd-nTagNameStart;

size_t nTagEnd=nTagNameEnd;

while((nTagEnd'))

nTagEnd++;

nTagEnd++;

const char* pTag=pContent+i;

size_t nTag=nTagEnd-i;

i=nTagEnd;

vector<

pair

> Attribs;

if((strTagName.cbData

==6)&&(strnicmp((char*)strTagName.pbData,"script",6)==0))//

{

size_t nScriptStart=i;

if(nContent>8)

while(i

{

if(strnicmp((char*)pContent+i,"

{

OnStartTag(strTagName,Attribs);

SZ_STRING strComment;

strComment.pbData =

pContent+nScriptStart;

strComment.cbData =

i-nScriptStart;

OnComment(strComment);

OnEndTag(strTagName);

i+=8;

break;

}

i++;

}

while((i'))

i++;

i++;

continue;

}

if(strTagName.pbData[0]=='/')

{

strTagName.pbData +=1;

strTagName.cbData -=1;

OnEndTag(strTagName);

continue;

}

size_t m=strTagName.cbData+1;

do

{

while((m

')||((char)pTag[m]=='\r')||((char)pTag[m]=='\n')||((char)pTag[m]=='\t')))

m++;

size_t nAttribStart=m;

while((m

m++;

size_t nAttribStop=m-1;

while((nAttribStop>0)&&((pTag[nAttribStop]=='

')||(pTag[nAttribStop]=='\r')||(pTag[nAttribStop]=='\n')||(pTag[nAttribStop]=='\t')))

nAttribStop--;

if(nAttribStop>0)

nAttribStop++;

SZ_STRING strAttribName;

strAttribName.pbData

=pTag+nAttribStart;

strAttribName.cbData

=nAttribStop-nAttribStart;

m++;

bool bStartWithDQ=false;

bool bStartWithQ=false;

bool bStartWithSpace=false;

if((m

{

bStartWithDQ=true;

m++;

}else

if((m

{

bStartWithQ=true;

m++;

}else

bStartWithSpace=true;

if(m>=nTag)

continue;

size_t nValueStart=m;

while((m

&&((char)pTag[m]!='>')

&&((char)pTag[m]!='\r')

&&((char)pTag[m]!='\n')

&&(!bStartWithDQ||((char)pTag[m]!='\"'))

&&(!bStartWithQ||((char)pTag[m]!='\''))

&&(!bStartWithSpace||((char)pTag[m]!='

')))

m++;

SZ_STRING strAttribValue;

strAttribValue.pbData

=pTag+nValueStart;

strAttribValue.cbData =m-nValueStart;

if(strAttribName.cbData !=0)

Attribs.push_back(pair(strAttribName,strAttribValue));

while((m

'))

m++;

}while(m

OnStartTag(strTagName,Attribs);

}else

{

while((i

')||(pContent[i]=='\r')||(pContent[i]=='\n')||(pContent[i]=='\t')))

i++;

size_t nTextBegin=i;

while((i

i++;

size_t nTextEnd=i;

while((nTextEnd>=nTextBegin)&&((pContent[nTextEnd-1]=='

')||(pContent[nTextEnd-1]=='\r')||(pContent[nTextEnd-1]=='\n')||(pContent[nTextEnd-1]=='\t')))

nTextEnd--;

if(nTextEnd<=nTextBegin)

continue;

SZ_STRING strData;

strData.pbData =pContent+nTextBegin;

strData.cbData =nTextEnd-nTextBegin;

OnData(strData);

}

}

return true;

}

有关Html Parser的使用范例, URL解析器

(2005-05-26 11:42:21)

解析完之后,所有的URL存在m_URL2Text表中

#ifndef _URL_PARSER_H_

#define _URL_PARSER_H_

#include "basehtmlparser.h"

#include

#include

using namespace std;

class CURLParser: public CBaseHtmlParser

{

public:

CURLParser(){m_bInTagA=false;}

bool Parse(const string& URL,const

string& Content);

public: map

m_URL2Text;

private:

bool m_bInTagA;

string m_strCurURL;

void OnStartTag(const SZ_STRING &

strTagName,vector<

pair

> Attribs);

void OnEndTag(const SZ_STRING &

strTagName);

void OnData(const SZ_STRING & strData);

};

#endif

#include "urlparser.h"

#ifndef WIN32

#define strnicmp strncasecmp

#endif

bool CURLParser::Parse(const

string& URL,const string&

Content)

{

m_URL2Text.clear();

return

CBaseHtmlParser::Parse(URL,Content);

}

void

CURLParser::OnStartTag(const

SZ_STRING & strTagName,vector<

pair

> Attribs)

{

if((strTagName.cbData==1)&&(strnicmp(strTagName.pbData,"A",strTagName.cbData)==0))

{

m_bInTagA=true;

m_strCurURL.clear();

for(size_t

i=0;i

{

SZ_STRING x=Attribs[i].first;

if((Attribs[i].first.cbData==4)&&(strnicmp(Attribs[i].first.pbData,"href",Attribs[i].first.cbData)==0))

{

m_strCurURL=string(Attribs[i].second.pbData,Attribs[i].second.cbData);

Relativity2AbsoluteURL(m_strCurURL);

break;

}

}

}

}

// End-tag event. Leaving an <a> element - or a <td>, which also ends link
// text collection - clears the "inside <a>" flag. Tag names are compared
// without regard to letter case.
void CURLParser::OnEndTag(const SZ_STRING & strTagName)
{
    const bool bClosesAnchor =
        (strTagName.cbData == 1) && (strnicmp(strTagName.pbData, "A", strTagName.cbData) == 0);
    const bool bClosesCell =
        (strTagName.cbData == 2) && (strnicmp(strTagName.pbData, "td", strTagName.cbData) == 0);

    if (bClosesAnchor || bClosesCell)
        m_bInTagA = false;
}

// Text event. While inside an <a> that has a URL, the text is appended to
// the entry of that URL in m_URL2Text (the entry is created on first use).
void CURLParser::OnData(const SZ_STRING & strData)
{
    if (!m_bInTagA)
        return;
    if (m_strCurURL.empty())
        return;

    m_URL2Text[m_strCurURL].append(strData.pbData, strData.cbData);
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值