C++正则表达式处理中文-CSDN博客

本文链接：https://blog.csdn.net/qq_35886593/article/details/105536971

boost:

问题的提出：

Boost.Regex作为Boost对正则表达式的实践，是C++开发中常用的模式匹配工具。但在这次使用过程中发现，它对中文的支持并不好。当我们指定\w匹配时，包含“数”或“节”等字的字符串就会出现匹配失败的问题。

解决方案：

思路：把字符都转换成宽字符，然后再匹配。
需要用到以下和宽字符有关的类：
1、wstring：
作为STL中和string相对应的类，专门用于处理宽字符串。方法和string都一样，区别是value_type是wchar_t。wstring类的对象要赋值或连接的常量字符串必须以L开头标示为宽字符。
2、wregex：
和regex相对应，专门处理宽字符的正则表达式类。同样可以使用regex_match()和regex_replace()等函数。regex_match()的结果需要放在wsmatch类的对象中。
字符和宽字符的相互转换：
1、RTL的方法
//把字符串转换成宽字符串
    setlocale( LC_CTYPE, "" );  // 很重要，没有这一句，转换会失败。
    int iWLen= mbstowcs( NULL, sToMatch.c_str(), sToMatch.length() );  // 计算转换后宽字符串的长度。（不包含字符串结束符）
    wchar_t *lpwsz= new wchar_t[iWLen+1];
    int i= mbstowcs( lpwsz, sToMatch.c_str(), sToMatch.length() );  // 转换。（转换后的字符串有结束符）
    wstring wsToMatch(lpwsz);
    delete []lpwsz;
//把宽字符串转换成字符串，输出使用
    int iLen= wcstombs( NULL, wsm[1].str().c_str(), 0 ); // 计算转换后字符串的长度。（不包含字符串结束符）
    char *lpsz= new char[iLen+1];
    int i= wcstombs( lpsz, wsm[1].str().c_str(), iLen ); // 转换。（没有结束符）
    lpsz[iLen] = '/0';
    string sToMatch(lpsz);
    delete []lpsz;
2、Win32 SDK的方法
//把字符串转换成宽字符串
    int iWLen= MultiByteToWideChar( CP_ACP, 0, sToMatch.c_str(), sToMatch.size(), 0, 0 ); // 计算转换后宽字符串的长度。（不包含字符串结束符）
    wchar_t *lpwsz= new wchar_t [iWLen+1];
    MultiByteToWideChar( CP_ACP, 0, sToMatch.c_str(), sToMatch.size(), lpwsz, iWLen ); // 正式转换。
    wsz[iWLen] = L'/0';
//把宽字符串转换成字符串，输出使用
    int iLen= WideCharToMultiByte( CP_ACP, NULL, wsResult.c_str(), -1, NULL, 0, NULL, FALSE ); // 计算转换后字符串的长度。（包含字符串结束符）
    char *lpsz= new char[iLen];
    WideCharToMultiByte( CP_OEMCP, NULL, wsResult.c_str(), -1, lpsz, iLen, NULL, FALSE); // 正式转换。
    sResult.assign( lpsz, iLen-1 ); // 对string对象进行赋值。

示例：

通过以下程序我们可以看到，对字符串做\w匹配时，某些字会引起匹配失败。通过把字符串转换成宽字符串尝试解决这个问题。

#include <iostream>
using std::cout;
using std::endl;
#include <string>
using std::string;
using std::wstring;
#include <locale>

#include "boost/tr1/regex.hpp"
using namespace boost;

// Matches the whole narrow (multibyte) string against "\w*" and prints the
// captured text. An empty result is printed when the string does not match.
void MatchWords(string sToMatch)
{
    // "\\w" is the word-character class; the previous "//w" matched two
    // literal slashes followed by the letter 'w'.
    regex rg("(\\w*)");
    smatch sm;
    string sMatched;
    // Only read the capture group when the match actually succeeded.
    if ( regex_match( sToMatch, sm, rg ) )
    {
        sMatched = sm[1].str();
    }
    cout << "匹配结果：" << sMatched << endl;
}

// Matches the whole wide string against L"\w*", converts the captured text
// back to a multibyte string with wcstombs, and prints it. An empty result is
// printed when the string does not match or the conversion fails.
// Non-ASCII text relies on the caller having selected a suitable LC_CTYPE
// locale before calling (wcstombs is locale dependent).
void MatchWords(wstring wsToMatch)
{
    // L"\\w" is the word-character class; the previous L"//w" matched two
    // literal slashes followed by the letter 'w'.
    wregex wrg(L"(\\w*)");
    wsmatch wsm;
    wstring wsMatched;
    // Only read the capture group when the match actually succeeded.
    if ( regex_match( wsToMatch, wsm, wrg ) )
    {
        wsMatched = wsm[1].str();
    }

    string sToMatch;
    // First call only measures: bytes needed, terminator excluded.
    const std::size_t iLen = wcstombs( NULL, wsMatched.c_str(), 0 );
    if ( iLen != static_cast<std::size_t>(-1) )
    {
        // The string owns the buffer and its terminator, so no manual
        // allocation or '\0' write is needed.
        sToMatch.resize( iLen );
        wcstombs( &sToMatch[0], wsMatched.c_str(), iLen );
    }
    cout << "匹配结果：" << sToMatch << endl;
}

void main()
{
    string sToMatch("数超限");
    MatchWords( sToMatch );
    sToMatch = "节点数目超限";
    MatchWords( sToMatch );

    setlocale( LC_CTYPE, "" );
    int iWLen= mbstowcs( NULL, sToMatch.c_str(), sToMatch.length() );
    wchar_t *lpwsz= new wchar_t[iWLen+1];
    int i= mbstowcs( lpwsz, sToMatch.c_str(), sToMatch.length() );

    wstring wsToMatch(lpwsz);
    delete []lpwsz;
    MatchWords( wsToMatch );
}

编译执行程序后输出：
   匹配结果：数超限
    匹配结果：
    匹配结果：节点数目超限
第一行显示“数超限”匹配成功。但第二行“节点数目超限”没有匹配到任何字符。只有转换成宽字符串之后才能够对“节点数目超限”成功进行\w匹配。
————————————————

std：

#include <iostream>
#include <regex>
#include <tchar.h> //_T
using namespace std;


// Demo of std::wregex on text containing Chinese characters: full match,
// search with per-group reporting, and two ways of replacing the matched IP.
int main()
{
    // Imbue wcout with the user's default locale so wide (Chinese) text can be
    // written. The original author noted that the named locale "chs"
    // reportedly has platform issues, hence the default locale is used.
    std::locale loc("");
    std::wcout.imbue(loc);

    std::wstring text(L"我的IP地址是:109.168.0.1.");
    std::wstring newIP(L"127.0.0.1");
    std::wstring regString(L"(\\d+)\\.(\\d+)\\.(\\d+)\\.(\\d+)");

    // Expression option: ignore case.
    std::regex_constants::syntax_option_type fl = std::regex_constants::icase;

    // Compile the regular expression.
    std::wregex regExpress(regString, fl);

    // Holds the match results.
    std::wsmatch ms;

    // Check whether the whole line matches.
    if (std::regex_match(text, ms, regExpress))
    {
        std::wcout << L"正则表达式:" << regString << L"匹配:" << text << L"成功." << std::endl;
    }
    else
    {
        std::wcout << L"正则表达式:" << regString << L"匹配:" << text << L"失败." << std::endl;
    }

    // Search for the first occurrence anywhere in the text.
    if (std::regex_search(text, ms, regExpress))
    {
        std::wcout << L"正则表达式:" << regString << L"查找:" << text << L"成功." << std::endl;
        for (size_t i = 0; i < ms.size(); ++i)
        {
            std::wcout << L"第" << i << L"个结果:\"" << ms.str(i) << L"\" - ";
            std::wcout <<L"起始位置:" << ms.position(i) << L"长度" << ms.length(i) << std::endl;
        }
        std::wcout << std::endl;

        // Replacement 1: overwrite the matched range in place. replace()
        // already modifies text, so no self-assignment is needed. ms holds
        // iterators into text and must not be used after this call.
        text.replace(ms[0].first, ms[0].second, newIP);
        std::wcout << L"替换1后的文本:" << text << std::endl;
    }
    else
    {
        std::wcout << L"正则表达式:" << regString << L"查找:" << text << L"失败." << std::endl;
    }

    // Replacement 2: regex_replace builds a new string with every match replaced.
    newIP = L"255.255.0.0";
    std::wstring newText = std::regex_replace(text, regExpress, newIP);
    std::wcout << L"替换2后的文本:" << newText << std::endl;

    // Wait for Enter before exiting.
    std::wcout << L"按回车键结束...";
    std::wcin.get();
    return 0;
}

#include "regex.hpp"
#include <regex>
#include <string>
#include <vector>
#include <iostream>

// Demonstrates std::regex_match, which succeeds only when the expression
// matches the ENTIRE input; it is mainly used to validate text.
// Reports each candidate on stderr and always returns 0.
int test_regex_match()
{
    // Fixed-line telephone: 3 digits + '-' + 8 digits, or 4 digits + '-' + 7 digits.
    const std::string phonePattern("\\d{3}-\\d{8}|\\d{4}-\\d{7}");
    const std::regex phoneRegex(phonePattern);

    const std::vector<std::string> candidates{ "010-12345678", "0319-9876543", "021-123456789"};

    for (const auto& number : candidates) {
        if (!std::regex_match(number, phoneRegex)) {
            fprintf(stderr, "%s, can not match\n", number.c_str());
            continue;
        }
        fprintf(stderr, "%s, can match\n", number.c_str());
    }

    return 0;
}

// Demonstrates std::regex_search, which (unlike regex_match) succeeds when any
// sub-sequence of the input matches the expression.
// Reports each candidate on stderr and always returns 0.
int test_regex_search()
{
    // URL with an http or https scheme. The previous pattern
    // "http|hppts://\\w*$" misspelled "https" and, because '|' binds loosest,
    // reduced to a search for the bare text "http".
    const std::string pattern{ "https?://\\S+" };
    const std::regex re(pattern);

    const std::vector<std::string> str{ "http://blog.csdn.net/fengbingchun", "https://github.com/fengbingchun",
        "abcd://124.456", "abcd https://github.com/fengbingchun 123" };

    for (const auto& tmp : str) {
        const bool ret = std::regex_search(tmp, re);
        if (ret) fprintf(stderr, "%s, can search\n", tmp.c_str());
        else fprintf(stderr, "%s, can not search\n", tmp.c_str());
    }

    return 0;
}

// Finds every URL-like token ("scheme://non-whitespace") in a sentence and
// prints each match, followed by a space, on its own line. Always returns 0.
int test_regex_search2()
{
    // The scheme is letters only. The previous class [a-zA-z] also admitted
    // the ASCII characters between 'Z' and 'a' ('[', '\\', ']', '^', '_', '`').
    const std::string pattern{ "[a-zA-Z]+://[^\\s]*" }; // url
    const std::regex re(pattern);

    const std::string str{ "my csdn blog addr is: http://blog.csdn.net/fengbingchun , my github addr is: https://github.com/fengbingchun " };

    // Iterate over the matches in place instead of re-copying the remaining
    // suffix into the string after every hit.
    const std::sregex_iterator last;
    for (std::sregex_iterator it(str.begin(), str.end(), re); it != last; ++it) {
        for (const auto& x : *it)
            std::cout << x << " ";
        std::cout << std::endl;
    }

    return 0;
}

// Demonstrates std::regex_replace, which finds every match of the expression
// in the input and substitutes the format string for each one.
// Prints source and result for each sample on stderr and always returns 0.
int test_regex_replace()
{
    // ID card number: 18 digits, or 17 digits followed by 'X'.
    const std::string idPattern("\\d{18}|\\d{17}X");
    const std::regex idRegex(idPattern);
    const std::string mask("********");

    const std::vector<std::string> samples{ "123456789012345678", "abcd123456789012345678efgh",
        "abcdefbg", "12345678901234567X" };

    for (const auto& sample : samples) {
        const std::string masked = std::regex_replace(sample, idRegex, mask);
        fprintf(stderr, "src: %s, dst: %s\n", sample.c_str(), masked.c_str());
    }

    return 0;
}

// Shows three std::regex_replace overloads on one sentence and prints each
// result to stdout. Always returns 0.
// reference: http://www.cplusplus.com/reference/regex/regex_replace/
int test_regex_replace2()
{
    const std::string sentence("there is a subsequence in the string\n");
    // Words beginning with "sub": group 1 is "sub", group 2 is the rest of the word.
    const std::regex subWord("\\b(sub)([^ ]*)");

    // string/c-string overload: returns the rewritten copy.
    const std::string hyphenated = std::regex_replace(sentence, subWord, "sub-$2");
    std::cout << hyphenated;

    // range/c-string overload: writes through an output iterator.
    std::string stripped;
    auto sink = std::back_inserter(stripped);
    std::regex_replace(sink, sentence.begin(), sentence.end(), subWord, "$2");
    std::cout << stripped;

    // format_no_copy: emit only the formatted matches, dropping unmatched text.
    const auto matchesOnly = std::regex_constants::format_no_copy;
    std::cout << std::regex_replace(sentence, subWord, "$1 and $2", matchesOnly) << std::endl;

    return 0;
}

正则表达式使用