用正则表达式提取数据

最新推荐文章于 2024-06-17 19:39:11 发布

东方匠心

最新推荐文章于 2024-06-17 19:39:11 发布

阅读量655

点赞数

分类专栏：Ｃ++

本文链接：https://blog.csdn.net/tastelife/article/details/19198299

版权

Ｃ++ 专栏收录该内容

55 篇文章 0 订阅

订阅专栏

想取出字符串中 <> 内的数据，但是怎样才能让结果不包含 < 和 >，还是不会啊

做了测试发现用正则表达式会慢

回复(6) liuhanlcj: (?=<).+(?<=>)

删除 | 2014-2-13 18:55 回复
baoming9999: 回复 liuhanlcj :不行，会异常啊，对正则表达式基本上仅在C#中用输入限制时用过，实在是头疼啊
删除 | 2014-2-13 20:46 回复
liuhanlcj: 回复 baoming9999 :不要用C#的，要用一些第三方库，C#的那个对断言支持不好。
删除 | 2014-2-13 21:44 回复
baoming9999: 回复 liuhanlcj :本想用std::regex的
删除 | 2014-2-13 21:47 回复
liuhanlcj: 回复 baoming9999 :那个绝对不要用，负向零宽断言会异常。如果你不要匹配中文，那就pcre，如果要匹配中文，那就google 的 re2.当然，好的有很多，我只是建议。
删除 | 2014-2-13 21:48 回复
- baoming9999: 回复 liuhanlcj :谢谢
  删除 | 2014-2-13 21:54 回复
我也说一句

还有1条回复，点击查看

baoming9999
亮出21CM

6

// ConsoleApplication1.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <iostream>
#include <sstream>
#include <string>
#include <regex>
#include <Windows.h>
using namespace std;
/// Collects every delimited segment of `src` into `extData` with a single
/// left-to-right scan (no regular expressions involved).
///
/// A segment opens at the first `extFlagBegin` seen while no segment is open
/// and closes at the next `extFlagEnd`. While a segment is open, any further
/// `extFlagBegin` is treated as ordinary data. A segment that is still open
/// when the input ends is discarded.
///
/// @param src            text to scan
/// @param extFlagBegin   character that opens a segment
/// @param extFlagEnd     character that closes a segment
/// @param extData        container receiving one string per segment via push_back
/// @param isIncludeFlag  when true the two delimiters are kept in each result
/// @return number of segments appended to `extData`
template<typename LT, typename E>
int Extraction(std::basic_string<E> src, E extFlagBegin, E extFlagEnd, LT& extData, bool isIncludeFlag=false)
{
    int found = 0;
    bool inside = false;
    std::basic_string<E> piece;

    for (const E ch : src)
    {
        if (!inside)
        {
            // Outside a segment only the opening delimiter matters.
            if (ch != extFlagBegin)
            {
                continue;
            }
            inside = true;
            if (isIncludeFlag)
            {
                piece += ch;
            }
            continue;
        }

        // Inside a segment: plain data until the closing delimiter shows up.
        if (ch != extFlagEnd)
        {
            piece += ch;
            continue;
        }

        // Closing delimiter: emit the segment and reset for the next one.
        if (isIncludeFlag)
        {
            piece += ch;
        }
        extData.push_back(piece);
        piece.clear();
        inside = false;
        ++found;
    }

    return found;
}

/// Regex-based counterpart of the hand-written scanner: collects every
/// `extFlagBegin ... extFlagEnd` segment of `src` into `extData`.
///
/// The pattern built is `<begin>[^<end>]*<end>`. Both delimiters are escaped
/// when they are regex metacharacters, so '(' / ')' or '[' / ']' work as
/// literal delimiters. The regex and iterator types follow `E`, so the
/// function works for both `char` and `wchar_t` strings.
///
/// Note: the regular expression is compiled on every call.
///
/// @param src            text to search
/// @param extFlagBegin   character that opens a segment
/// @param extFlagEnd     character that closes a segment
/// @param extData        container receiving one string per match via push_back
/// @param isIncludeFlag  when true the two delimiters are kept in each result
/// @return number of matches appended to `extData`
/// @throws std::regex_error if the generated pattern cannot be compiled
template<typename LT, typename E>
int ExtractionRegex(std::basic_string<E> src, E extFlagBegin, E extFlagEnd, LT& extData, bool isIncludeFlag=false)
{
    typedef std::basic_string<E> string_type;
    typedef std::regex_token_iterator<typename string_type::const_iterator> token_iterator;

    // Appends `ch` to `pattern` as a literal, prefixing a backslash when the
    // character has a special meaning in the ECMAScript regex grammar.
    auto appendLiteral = [](string_type& pattern, E ch)
    {
        static const char kSpecial[] = "\\^$.|?*+()[]{}";
        for (const char* p = kSpecial; *p != '\0'; ++p)
        {
            if (ch == static_cast<E>(*p))
            {
                pattern.push_back(static_cast<E>('\\'));
                break;
            }
        }
        pattern.push_back(ch);
    };

    // begin, then any run of non-end characters, then end.
    string_type pattern;
    appendLiteral(pattern, extFlagBegin);
    pattern.push_back(static_cast<E>('['));
    pattern.push_back(static_cast<E>('^'));
    appendLiteral(pattern, extFlagEnd);
    pattern.push_back(static_cast<E>(']'));
    pattern.push_back(static_cast<E>('*'));
    appendLiteral(pattern, extFlagEnd);

    const std::basic_regex<E> re(pattern);

    int count = 0;
    const token_iterator last;
    for (token_iterator it(src.cbegin(), src.cend(), re); it != last; ++it)
    {
        string_type match = it->str();
        if (!isIncludeFlag)
        {
            // Every match is at least "<begin><end>", so stripping one
            // character from each side is always in range.
            match = match.substr(1, match.size() - 2);
        }
        extData.push_back(match);
        ++count;
    }
    return count;
}
// Demo and micro-benchmark driver: prints the segments found by both
// extractors (without and with delimiters), then times 1000 rounds of each
// extractor and prints the elapsed seconds.
int _tmain(int argc, _TCHAR* argv[])
{
    std::string str = "jay@so<hj>u<>t<hr>idg<evnnnnnni av>deo@com";
    std::vector<std::string> vt;

    // Writes every collected segment on its own line.
    auto dump = [](std::vector<std::string>& items)
    {
        for (std::string& item : items)
        {
            std::cout << item << std::endl;
        }
    };

    // Regex extractor, delimiters stripped then kept.
    ExtractionRegex(str, '<', '>', vt);
    dump(vt);
    vt.clear();
    ExtractionRegex(str, '<', '>', vt, true);
    dump(vt);

    // Hand-written extractor, delimiters stripped then kept.
    vt.clear();
    Extraction(str, '<', '>', vt);
    dump(vt);
    vt.clear();
    Extraction(str, '<', '>', vt, true);
    dump(vt);

    const int rounds = 1000;

    // Time the regex extractor (both modes per round).
    clock_t begun = clock();
    for (int round = 0; round < rounds; ++round)
    {
        vt.clear();
        ExtractionRegex(str, '<', '>', vt);
        vt.clear();
        ExtractionRegex(str, '<', '>', vt, true);
    }
    clock_t ended = clock();
    std::cout << static_cast<double>(ended - begun) / CLOCKS_PER_SEC << std::endl;

    // Time the hand-written extractor (both modes per round).
    vt.clear();
    begun = clock();
    for (int round = 0; round < rounds; ++round)
    {
        vt.clear();
        Extraction(str, '<', '>', vt);
        vt.clear();
        Extraction(str, '<', '>', vt, true);
    }
    ended = clock();
    std::cout << static_cast<double>(ended - begun) / CLOCKS_PER_SEC << std::endl;

    system("pause");
    return 0;
}

森哥先森

亮出24CM

9

提取数据后，用copy之类的截取不要部分？效率略低，但是只要这不是瓶颈代码就没问题。。。

回复(5) 收起回复

5楼
2014-02-13 20:48

删除 |

baoming9999: 上面的调用正则用时：3.466，自己写的0.298，在这种使用情况下是不能用了
删除 | 2014-2-13 20:58 回复
森哥先森: 回复 baoming9999 :但是你的代码不会大部分都在处理正则吧？如果是，当我没说过。。。
删除 | 2014-2-13 21:05 回复
baoming9999: 回复森哥先森 :互相讨论吧，用户的每一个操作，花费的时间不应该超过1.5秒，最好0.8秒内，1000条的数据处理还是很常见的，还要有别的处理，所以对多条数据处理时，每一个函数都要斟酌下了
删除 | 2014-2-13 21:24 回复
森哥先森: 回复 baoming9999 :好吧，我写的是人工智能（下棋）程序，主要时间都在上面，一运行跑数小时，除了核心代码，其他地方怎么写都没问题，就算占用一分钟，依然没问题。。。
删除 | 2014-2-13 21:26 回复
baoming9999: 回复森哥先森 :嗯，这就是需求不同
删除 | 2014-2-13 21:35 回复
我也说一句

sunchy12345678
小吧主

11

前向断言后向断言标准好像没有规定？
boost::xpressive似乎支持这样的语法
类似(?=<)[^>]*(?<=>)这样。

使用pcre 带不带分隔符各1000条 0.062秒

// ConsoleApplication1.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"

#include <cstring>
#include <iostream>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <Windows.h>

#define PCRE_STATIC
#include "pcre.h"
#pragma comment (lib, "pcre.lib")
#pragma comment (lib, "pcreposix.lib")

using namespace std;

class WXRegex
{

#define OVECCOUNT 30 /* should be a multiple of 3 */

pcre *re;

public:
    WXRegex() : re(nullptr)
    {
        if(re != nullptr)
        {
            free(re);
        }
    }
    ~WXRegex()
    {
    }
    void Compile(const char* pattern="(?<=<)[^>]*(?=>)")
    {
        int             erroffset;
        const char      *error;

        //释放以前的
        if(re != nullptr)
        {
            free(re);
        }

        re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
        if (re == NULL)
        {
            throw exception(error);
        }
    }
    int Extraction(const char* src, std::vector<string>& out)
    {
        int count= 0;

        int exec_offset = 0;
        int rc=0;
        int ovector[OVECCOUNT];
        const char *captured_string;

        do {
            rc = pcre_exec(re, NULL, src, strlen(src), exec_offset, 0, ovector, OVECCOUNT);
            if (rc < 0)
            {
                break;
            }
            for (int i = 0; i < rc; i++)
            {
                pcre_get_substring( src, ovector, rc, 0, &captured_string );
                out.push_back(captured_string);
            }

exec_offset = ovector[1];

count++;

} while ( rc > 0 );

return count;
}
};

// Demo and micro-benchmark driver for WXRegex: prints the matches of the
// delimiter-including pattern, times 1000 extractions with each of the two
// patterns, prints the matches of the delimiter-excluding pattern, and
// finally prints the elapsed seconds.
int _tmain(int argc, _TCHAR* argv[])
{
    std::string str = "<sd>< jay@>so<hj>ut<h在r>i在工城某工dg<evnnn厅nnni av>deo@com";

    WXRegex wxr;
    std::vector<std::string> out;

    // Writes every collected match on its own line.
    auto dump = [&out]()
    {
        for (std::string& item : out)
        {
            std::cout << item << std::endl;
        }
    };

    // Matches including the '<' and '>' delimiters.
    wxr.Compile("<[^>]*>");
    wxr.Extraction(str.c_str(), out);
    dump();

    const clock_t start = clock();
    for (int i = 0; i < 1000; i++)
    {
        out.clear();
        wxr.Extraction(str.c_str(), out);
    }

    // Matches excluding the delimiters (lookbehind / lookahead).
    wxr.Compile("(?<=<)[^>]*(?=>)");
    for (int i = 0; i < 1000; i++)
    {
        out.clear();
        wxr.Extraction(str.c_str(), out);
    }
    // Stop the clock before printing so console I/O is not part of the
    // measured time.
    const clock_t finish = clock();

    out.clear();
    wxr.Extraction(str.c_str(), out);
    dump();

    std::cout << static_cast<double>(finish - start) / CLOCKS_PER_SEC << std::endl;

    system("pause");

    return 0;
}