xgboost源码中处理空格的函数
// Returns a copy of `str` with leading and trailing whitespace stripped.
// Space, tab, line feed and carriage return are treated as whitespace.
// An empty or all-whitespace input produces an empty string.
static std::string TrimWhitespace(const std::string& str) {
  const char* const kBlanks = " \t\n\r";
  const std::string::size_type begin = str.find_first_not_of(kBlanks);
  if (begin == std::string::npos) {
    // Nothing but whitespace (or nothing at all) in the input.
    return {};
  }
  // At least one non-blank character exists, so this search cannot fail.
  const std::string::size_type end = str.find_last_not_of(kBlanks) + 1;
  return std::string(str, begin, end - begin);
}
xgboost中的正则表达式:
// Constructs a parser for the config file located at `path` and compiles the
// regular expressions used to recognise comments, keys and values.
//
// `path` is taken by non-const value so that std::move actually moves it into
// path_; with a const parameter the move silently degrades to a copy.
explicit ConfigParser(std::string path)
    : path_(std::move(path)),
      // A '#' at the very start of the text.
      line_comment_regex_("^#"),
      // Unquoted key: one or more characters other than # " ' = CR LF TAB
      // SPACE, then optional tabs/spaces, then '='.
      key_regex_(R"rx(^([^#"'=\r\n\t ]+)[\t ]*=)rx"),
      // Quoted key: an opening " or ', one or more characters other than
      // quotes, '=', CR, LF, the same closing quote (\1), optional
      // tabs/spaces, then '='.
      key_regex_escaped_(R"rx(^(["'])([^"'=\r\n]+)\1[\t ]*=)rx"),
      // Unquoted value, optional tabs/spaces, then an optional trailing
      // '#' comment running to the end.
      value_regex_(R"rx(^([^#"'\r\n\t ]+)[\t ]*(?:#.*){0,1}$)rx"),
      // Quoted value (same quote on both sides), optional tabs/spaces, then
      // an optional trailing '#' comment running to the end.
      value_regex_escaped_(R"rx(^(["'])([^"'\r\n]+)\1[\t ]*(?:#.*){0,1}$)rx")
{}
R"rx(^([^#"'=\r\n\t ]+)[\t ]*=)rx")
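最前面的 ^ 表示必须从字符串(行)的开头开始匹配;方括号 [] 中的 ^ 表示取反,即 [^#"'=\r\n\t ] 匹配任意一个不是 #、"、'、=、回车、换行、制表符、空格的字符。
+ 表示前面的字符集重复 1 次或多次,因此 ([^#"'=\r\n\t ]+) 这个捕获分组匹配到的就是键名;其后的 [\t ]*= 匹配零个或多个制表符/空格再加一个等号。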
//
// Created by 吴际 on 2020/4/20.
//
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <regex>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
// Reads the whole file at `path` in binary mode and returns its bytes as a
// string. No error is reported if the file cannot be opened; the result is
// simply empty in that case.
std::string LoadConfigFile(const std::string& path) {
  using CharIter = std::istreambuf_iterator<char>;
  std::ifstream input(path, std::ios_base::in | std::ios_base::binary);
  // Drain the stream buffer from the current position to end-of-file.
  return std::string(CharIter(input), CharIter());
}
// Strips leading and trailing whitespace (space, tab, LF, CR) from `str` and
// returns the remaining middle part as a new string. If `str` is empty or
// consists solely of whitespace, an empty string is returned.
static std::string TrimWhitespace(const std::string& str) {
  static constexpr char kWhitespace[] = " \t\n\r";
  const auto head = str.find_first_not_of(kWhitespace);
  const auto tail = str.find_last_not_of(kWhitespace);
  if (head != std::string::npos) {
    // head is valid, so tail is valid too and tail >= head.
    return std::string(str.begin() + head, str.begin() + tail + 1);
  }
  return std::string();
}
// Small experiment: runs a simplified key regex against one hard-coded config
// line ("booster = gbtree"), prints every piece of the match, then splits the
// line into a (key, value) pair and prints it.
//
// Returns 0 on success, 1 if the regex does not match the test line.
int main(int argc, char* argv[]) {
  (void)argc;
  (void)argv;

  // Key pattern: one or more characters other than # = CR LF TAB SPACE at the
  // start of the text, optional tabs/spaces, then '='. Capture group 1 is the
  // key. (Unlike xgboost's key_regex_, quote characters are not excluded.)
  const std::regex key_regex_(R"rx(^([^#=\r\n\t ]+)[\t ]*=)rx");
  const std::string test_regex = "booster = gbtree";

  std::smatch m;
  const bool matched = std::regex_search(test_regex, m, key_regex_);
  std::cout << matched << '\n';
  if (!matched) {
    // prefix(), suffix() and m[1] carry no useful data without a match.
    return 1;
  }

  std::cout << "prefix=" << m.prefix() << '\n';
  for (std::size_t i = 0; i < m.size(); ++i) {
    std::cout << "sm[" << i << "]: " << m[i].str() << '\n';
  }
  // NOTE(review): the label reads "prefix" but the value printed is the
  // suffix (everything after '='). The text is kept verbatim so the program
  // output does not change.
  std::cout << "sm.prefix: " << m.suffix() << '\n';

  // The key is capture group 1; the value is whatever follows the '=',
  // with surrounding whitespace removed.
  const std::string key = m[1].str();
  const std::string value = TrimWhitespace(m.suffix().str());

  // One (key, value) entry is stored per sub-match (full match + group 1),
  // so the same pair is printed m.size() times.
  std::vector<std::pair<std::string, std::string> > v;
  v.reserve(m.size());
  for (std::size_t i = 0; i < m.size(); ++i) {
    v.emplace_back(key, value);
  }
  for (const auto& p : v) {
    std::cout << "first=" << p.first << "\nsecond=" << p.second << '\n';
  }
  return 0;
}
结果
1
prefix=
sm[0]: booster =
sm[1]: booster
sm.prefix: gbtree
first=booster
second=gbtree
first=booster
second=gbtree