使用LIBTIDY解析HTML文档的小例子一个

最新推荐文章于 2022-04-06 11:11:58 发布

jfkidear

最新推荐文章于 2022-04-06 11:11:58 发布

阅读量2.6k

点赞数

分类专栏：函数文章标签： html 文档 algorithm buffer file string

函数专栏收录该内容

74 篇文章 1 订阅

订阅专栏

使用LIBTIDY解析HTML文档的小例子一个

libtidy 是一个开源库,用于诊断、分析和生成 HTML 文档。
下面的例子演示如何使用 libtidy 提取页面中的链接。
代码如下:

#ifndef PARSEPAGE_HPP
#define PARFSPAGE_HPP
#include < string >
#include < vector >
#include < tidy / buffio.h >
#include < tidy / fileio.h >
#include < tidy / tidy.h >
#include < tidy / tidyenum.h >
#include < tidy / platform.h >

// ! 解析html页面
class ParsePage
{
public :
    typedef std::vector < std:: string > String;
public :
    ParsePage( int rank = 0 , const std:: string & cur = "" ):rank(rank),cur(cur)
    {
        doc = tidyCreate();
        root = tidyGetRoot(doc);
    }
     ~ ParsePage()
    {
        tidyRelease(doc);
    }
public :
     // ! 解析给定文件
     bool LoadFile( const char * file)
    {
         return 1 == tidyParseFile(doc,file);
    }
     // !解析给定内存
     bool LoadBuffer( const char * buffer)
    {
         return 1 == tidyParseString(doc,buffer);
    }
     // ! 内容解析
     void Check()
    {
        CheckHref(root);
    }
     // ! 获取链接
     int GetLinkNumber() const { return links.size();}
    std:: string GetLinkByIndex( int index){ return links.at(index);}
private :
     void DoHref(TidyAttr attr);
     void CheckHref(TidyNode node);
private :
    TidyDoc     doc;
    TidyNode    root;
    std:: string cur;
     int          rank;
    String      links;
};

#endif
// ! ccsdu2004

实现:

#include < boost / algorithm / string .hpp >
#include " parsepage.hpp "

//! Examines one href attribute and, if it is an acceptable link, appends it
//! to `links`.
//!
//! - An attribute without a value is ignored.
//! - "mailto:" addresses are ignored.
//! - "http:" / "https:" URLs are stored with any "#fragment" part removed.
//! - Any other value (for example a relative path) is stored unchanged,
//!   unless it contains '#', in which case it is ignored.
void ParsePage::DoHref(TidyAttr attr)
{
    // tidyAttrValue may hand back NULL (e.g. <a href>); constructing a
    // std::string from a null pointer is undefined behaviour.
    const char* value = tidyAttrValue(attr);
    if (!value)
        return;

    std::string href(value);

    // Mail addresses are not page links.
    if (boost::algorithm::starts_with(href, "mailto:"))
        return;

    if (boost::algorithm::starts_with(href, "http:") ||
        boost::algorithm::starts_with(href, "https:"))
    {
        // The fragment starts at the first '#'; drop it so that links to
        // different anchors of the same page collapse to the same URL.
        const std::string::size_type fragment = href.find('#');
        if (fragment != std::string::npos)
        {
            href.erase(fragment);
        }
    }
    else if (boost::algorithm::contains(href, "#"))
    {
        // Non-absolute reference carrying an anchor: skipped.
        return;
    }

    links.push_back(href);
}

//! Recursively visits every descendant of `node`. Each descendant that has
//! an href attribute is passed to DoHref() before its own children are
//! visited.
void ParsePage::CheckHref(TidyNode node)
{
    TidyNode current = tidyGetChild(node);
    while (current)
    {
        // Handle this element's own href first ...
        TidyAttr hrefAttr = tidyAttrGetHREF(current);
        if (hrefAttr)
            DoHref(hrefAttr);

        // ... then descend into its subtree and move on to the next sibling.
        CheckHref(current);
        current = tidyGetNext(current);
    }
}

这个类比较简单:
先调用 LoadFile 或 LoadBuffer 载入页面,再调用 Check;之后页面中的所有链接都保存在 links 中,可通过 GetLinkNumber 和 GetLinkByIndex 读取。

jfkidear

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
使用LIBTIDY解析HTML文档的小例子一个

使用LIBTIDY解析HTML文档的小例子一个 libtidy是一个开源的用来诊断,分析,生成html文档的一个库 下面的例子是使用libtidy获取页面链接的例子 代码如下: #ifndef PARSEPAGE_HPP #define PARSEPAGE_HPP #include <string> #include <vector> #include <tidy/buffio.
复制链接

扫一扫

专栏目录