xgboost源码

最新推荐文章于 2024-06-19 12:41:17 发布

unshaven111

最新推荐文章于 2024-06-19 12:41:17 发布

阅读量615

点赞数

分类专栏： xgboost源码分析文章标签：机器学习

本文链接：https://blog.csdn.net/qq_38251888/article/details/105657472

版权

xgboost源码分析专栏收录该内容

2 篇文章 0 订阅

订阅专栏

xgboost源码中处理空格的函数

// Returns a copy of `str` with leading and trailing blanks removed.
// Blanks are space, tab, line feed and carriage return; interior blanks
// are preserved. An empty or all-blank input yields an empty string.
static std::string TrimWhitespace(const std::string& str) {
    const char* const blanks = " \t\n\r";
    const std::string::size_type begin = str.find_first_not_of(blanks);
    if (begin == std::string::npos) {
        // Nothing but blanks (or nothing at all) in the input.
        return std::string();
    }
    // A non-blank character exists, so the backwards search cannot fail.
    const std::string::size_type end = str.find_last_not_of(blanks) + 1;
    return std::string(str, begin, end - begin);
}

xgboost中的正则表达式：

  /*!
   * \brief Create a parser bound to a config file location.
   *
   * The constructor only stores the path and initializes the patterns used
   * to recognise comments, keys and values; nothing is read here.
   *
   * \param path Location of the config file. Taken by non-const value so
   *             that std::move below really moves; with a `const` parameter
   *             the move silently degrades to a copy.
   */
  explicit ConfigParser(std::string path)
      : path_(std::move(path)),
      // A '#' in the first column.
      line_comment_regex_("^#"),
      // Unquoted key: one or more characters other than # " ' = CR LF TAB
      // or space, then optional tabs/spaces and '='.
      key_regex_(R"rx(^([^#"'=\r\n\t ]+)[\t ]*=)rx"),
      // Quoted key: opening ' or ", the key text, the same quote again (\1),
      // then optional tabs/spaces and '='.
      key_regex_escaped_(R"rx(^(["'])([^"'=\r\n]+)\1[\t ]*=)rx"),
      // Unquoted value, optionally followed by tabs/spaces and a '#' comment
      // running to the end.
      value_regex_(R"rx(^([^#"'\r\n\t ]+)[\t ]*(?:#.*){0,1}$)rx"),
      // Quoted value with matching quotes, optionally followed by
      // tabs/spaces and a '#' comment running to the end.
      value_regex_escaped_(R"rx(^(["'])([^"'\r\n]+)\1[\t ]*(?:#.*){0,1}$)rx")
  {}

R"rx(^([^#"'=\r\n\t ]+)[\t ]*=)rx")

最前面的 ^ 表示从字符串（行）的开头开始匹配；方括号 [] 内开头的 ^ 表示取反，即该分组中的每个字符都不能是 #、"、'、=、\r、\n、\t 或空格（而不仅仅是“不以这些字符开头”）。
+ 表示前面的字符集合 [^...] 重复 1 次或多次；随后的 [\t ]*= 匹配任意个制表符或空格，再跟一个等号。

//
// Created by 吴际 on 2020/4/20.
//



#include <fstream>
#include <iostream>
#include <iterator>
#include <regex>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
// Reads the whole file at `path` in binary mode and returns its bytes.
// If the file cannot be opened the stream yields nothing, so the result
// is an empty string.
std::string LoadConfigFile(const std::string& path) {
    using StreamIter = std::istreambuf_iterator<char>;
    std::ifstream input(path, std::ios_base::in | std::ios_base::binary);
    std::string text;
    // Copy everything from the current position up to end-of-stream.
    text.assign(StreamIter(input), StreamIter());
    return text;
}
// Returns a copy of `str` with leading and trailing blanks removed.
// Blanks are space, tab, line feed and carriage return; interior blanks
// are preserved. An empty or all-blank input yields an empty string.
static std::string TrimWhitespace(const std::string& str) {
    const char* const blanks = " \t\n\r";
    const std::string::size_type begin = str.find_first_not_of(blanks);
    if (begin == std::string::npos) {
        // Nothing but blanks (or nothing at all) in the input.
        return std::string();
    }
    // A non-blank character exists, so the backwards search cannot fail.
    const std::string::size_type end = str.find_last_not_of(blanks) + 1;
    return std::string(str, begin, end - begin);
}

// Demo of the key regex used when parsing "key = value" config lines.
//
// Loads a config file (argv[1] when supplied, otherwise the original
// hard-coded location), then runs the key regex against a fixed sample
// line and prints the match, the extracted key and the trimmed value.
//
// Returns 0 on success and 1 when the sample line contains no key.
int main(int argc, char* argv[]) {
    const std::string path =
        argc > 1 ? argv[1] : "/Users/wuji/Documents/xgboost/mushroom.conf";
    // The file is loaded as in the original demo, but its content is not
    // parsed below; only the fixed sample line is matched.
    const std::string s = LoadConfigFile(path);

    // Key: one or more characters other than # = CR LF TAB or space at the
    // start of the input, followed by optional tabs/spaces and '='.
    const std::regex key_regex_(R"rx(^([^#=\r\n\t ]+)[\t ]*=)rx");
    const std::string test_regex = "booster = gbtree";

    std::smatch m;
    const bool matched = std::regex_search(test_regex, m, key_regex_);
    std::cout << matched << '\n';
    if (!matched) {
        // Without a match there is no m[1] / suffix to work with.
        std::cerr << "no key found in: " << test_regex << '\n';
        return 1;
    }

    std::cout << "prefix=" << m.prefix() << '\n';
    for (std::smatch::size_type i = 0; i < m.size(); ++i) {
        std::cout << "sm[" << i << "]: " << m[i].str() << '\n';
    }
    // NOTE(review): the label says "prefix" but the value printed is the
    // suffix; the text is kept as-is so the program output does not change.
    std::cout << "sm.prefix: " << m.suffix() << '\n';

    // m[1] is the captured key; everything after the '=' is the raw value.
    const std::string key = m[1].str();
    const std::string value = TrimWhitespace(m.suffix().str());

    // NOTE(review): this stores the same (key, value) pair once per
    // sub-match, so it is printed m.size() times; presumably unintended,
    // kept to preserve the existing output - confirm.
    std::vector<std::pair<std::string, std::string>> v;
    v.reserve(m.size());
    for (std::smatch::size_type i = 0; i < m.size(); ++i) {
        v.emplace_back(key, value);
    }
    for (const auto& p : v) {
        std::cout << "first=" << p.first << "\nsecond=" << p.second << '\n';
    }
    std::cout.flush();
    return 0;
}

结果
1
prefix=
sm[0]: booster =
sm[1]: booster
sm.prefix: gbtree
first=booster
second=gbtree
first=booster
second=gbtree

unshaven111

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
xgboost源码

xgboost源码中处理空格的函数static std::string TrimWhitespace(const std::string& str) { const auto first_char = str.find_first_not_of(" \t\n\r"); const auto last_char = str.find_last_not_of(" \t\n\r...
复制链接

扫一扫