boost——字符串与文本处理tokenizer

最新推荐文章于 2021-10-06 07:41:13 发布

竹杖芒鞋轻胜马，谁怕？一蓑烟雨任平生。

最新推荐文章于 2021-10-06 07:41:13 发布

阅读量563

点赞数

分类专栏： boost、boost从入门到精通 | 文章标签： boost、tokenizer、文本处理

本文链接：https://blog.csdn.net/qq_21127151/article/details/53572385

版权

boost 同时被 2 个专栏收录

9 篇文章 0 订阅

订阅专栏

boost从入门到精通

9 篇文章 5 订阅

订阅专栏

#include <iostream>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <algorithm>
#include <boost/tokenizer.hpp>
#include <boost/typeof/typeof.hpp>
#include <string.h>

// Bring the Boost and standard-library namespaces into scope so that names
// such as tokenizer, string and cout can be written unqualified in this file.
using namespace boost;
using namespace std;
/**
 * Prints every element of a tokenizer (or any other iterable range) on one line.
 *
 * Each element is written to std::cout as "[element] " and the line is then
 * terminated with std::endl, which also flushes the stream.
 *
 * @tparam T   Range type that exposes a nested const_iterator typedef and
 *             const begin()/end(); its elements must support operator<<.
 * @param tok  Range to print. It is taken by const reference because printing
 *             never modifies it; this also lets callers pass temporaries.
 */
template<typename T>
void print(const T& tok)
{
    // The iterator type is spelled out through the nested typedef, so this
    // helper needs neither the BOOST_AUTO macro nor C++11 `auto`.
    typedef typename T::const_iterator const_iter;
    for (const_iter pos = tok.begin(), last = tok.end(); pos != last; ++pos) {
        std::cout << "[" << *pos << "] ";
    }
    std::cout << std::endl;
}
int main( int argc,char **argv)
{
    /*tokenizer默认把所有的空格和标点符号作为分隔符，因此分割出的只是单词，和string_algo::split的算法有区别*/
    string str("link raise the master-swaord");
    tokenizer<> tok(str);
    for(BOOST_AUTO(pos,tok.begin());pos != tok.end();++pos){
        cout << "["<< *pos << "] ";
    }
    cout << endl;

    /*分词函数对象：
     *tokenizer提供四个分词对象：
     *1.char_delimiters_separator：使用标点符号分词，是tokeizer默认使用的分词函数对象，已经被声明废弃；
     *2.char_separator:支持一个字符集合作为分隔符，默认标点符号分词；
     *3.escaped_list_separator:使用逗号分隔的分词；
     *4.off_separator:使用偏移量来分词，分解平文件格式的字符串很有用
     */

    /*char_separator(const char *dropped_delims,
    const char* kept_delims = 0,
    empty_token_policy empty_tokens = drop_empty_tokens)；
     * 1.第一个参数是分隔符集合，此集合中的字符不会作为分词的结果出现
     * 2.第二个参数也是分隔符集合，但是其中的字符会保留在分隔结果中
     * 3.第三个参数类似split算法中的eCompress参数，处理连续两个出现的分隔符,一个空字符串， 
     * 相当于split算法中token_compress_off值；
若为drop_empty_tokens，则空白单词不会作为分词的结果。
     */
    char* str2 = "Link ;;::<mom-dad> zebbix";
    char_separator<char> csp;
    tokenizer<char_separator<char>,char*> tok2(str2,str2 + strlen(str2,csp));
    print(tok2);

    tok2..assign(str2,str2+strlen(str2),char_separator<char>(" ;-","<>"));
    print(tok2);

    tok2..assign(str2,str2+strlen(str2),char_separator<char>(" ;-<>","",keep_empty_tokens));
    print(tok2);
    /*explicit escaped_list_separator( Char  e = ' \\' ，Char c = ' ,',
    Char  q = ' \" ' ) : escape_( 1,e) , c_( 1,c) , quote_( 1,q) , last_( false)  {}
     * 其构造函数一般取默认值：
     * 1.第一个参数e指定了字符中的转义字符，默认是斜杠(\);
     * 2.第二个参数是分隔符，默认是逗号；
     * 3.第三个参数是引号字符，默认是";
    */

    string str1 = "id,100,name,\"Tony\"";
    escaped_list_separator<char> esp;
    tokenizer<escaped_list_separator<char> > tok1(str1,esp);
    print(tok1);

     return (0);
}