原理:将模式串与待匹配文本都转换成 Unicode(宽字符)之后再做正则匹配。可以用 Python 实现,也可以用 C++ 的 Boost.Regex 实现。
方案一:解析程序用 C/C++ 编写,中间调用 Python 函数,由 Python 函数使用正则表达式完成解析。本地可以运行,但在 Hadoop 集群上运行不了。
方案二:采用 Boost 的 wregex(宽字符正则),需要从 C++ 源码编译 Boost 库。
备注:cpp 文件都使用 UTF-8 编码。
方案一代码:
#-*-coding:UTF-8-*-
import re;
import sys;
import time;
def add(a, b):
    """Collect capture group 1 of every match of pattern ``a`` in text ``b``.

    Both arguments may be UTF-8 encoded byte strings or already-decoded text.
    Matching is performed on decoded text so that a multi-byte character is
    treated as a single character by the regular expression.

    Returns the captured strings joined by tab characters, as the
    interpreter's native ``str`` type (UTF-8 encoded bytes on Python 2, text
    on Python 3).  Returns "" when there is no match, when an argument cannot
    be decoded, when the pattern is invalid or has no group 1, or when group 1
    did not take part in a match.
    """
    text_type = type(u"")

    def to_text(value):
        # Byte strings are decoded as UTF-8; text passes through unchanged.
        if isinstance(value, text_type):
            return value
        return value.decode("UTF-8")

    try:
        pchinese = re.compile(to_text(a))
        uline = to_text(b)
    except (UnicodeError, AttributeError, TypeError, re.error):
        return ""

    mylist = []
    # finditer always makes progress, even when the pattern can match the
    # empty string (a manual search-from-m.end() loop would spin forever).
    for m in pchinese.finditer(uline):
        try:
            piece = m.group(1)
        except IndexError:
            # The pattern defines no capture group 1.
            return ""
        if piece is None:
            # Group 1 exists but did not participate in this match.
            return ""
        mylist.append(piece)

    joined = u"\t".join(mylist)
    if str is not text_type:
        # Python 2: hand back a UTF-8 byte string, as callers expect.
        joined = joined.encode("UTF-8")
    return joined
# Manual smoke run: read the sample page from disk and feed it to add().
if __name__ == "__main__":
    sample_pattern = "<li><span>字义:</span>(.*?)</li>"
    sample_file = open("qiming2.txt", "r")
    sample_text = sample_file.read()
    sample_file.close()
    # The result is not used here; the call only exercises add().
    add(sample_pattern, sample_text)
char line[102400]={0};
char text[102400]={0};
char pattern[200]={0};
strcpy(pattern,t.c_str());
while(fgets(line,102400,stdin))
{
//text.assign(line);
//wstring wtext = String2Wstringx(t);
//wstring::const_iterator it=wtext.begin();
// wstring::const_iterator end=wtext.end();
//while(boost::regex_search(it,end,wm,wreg))
// {
// wstring wtemp=wm[1];
// string temp=Wstring2String(wtemp);
// results.push_back(temp);
// it=wm[1].second;
//}
strcat(text,line);
strcat(text,"\n");
}
//string t="刘[^刘]*?,";
//wstring ws=String2Wstring(s);
//cout<<p.size()<<endl;
//cout<<ws.size()<<endl;
//fprintf(stdout,"输出正则匹配结果\n");
//for(vector<string>::iterator it=results.begin();it!=results.end();it++)
//{
// printf("%s\n",(*it).c_str());
//}
Py_Initialize();
// 检查初始化是否成功
if ( !Py_IsInitialized() )
{
return -1;
}
// 添加当前路径
//把输入的字符串作为Python代码直接运行,返回0
//表示成功,-1表示有错。大多时候错误都是因为字符串
//中有语法错误。
PyRun_SimpleString("import sys");
PyRun_SimpleString("sys.path.append('./')");
PyObject *pName,*pModule,*pDict,*pFunc,*pArgs, *ret;
// 载入名为pytest的脚本
pName = PyString_FromString("pytest");
pModule = PyImport_Import(pName);
if ( !pModule )
{
printf("can't find pytest.py");
return -1;
}
pDict = PyModule_GetDict(pModule);
if ( !pDict )
{
return -1;
}
// 找出函数名为add的函数
pFunc = PyDict_GetItemString(pDict, "add");
if ( !pFunc || !PyCallable_Check(pFunc) )
{
printf("can't find function [add]");
return -1;
}
// 参数进栈
*pArgs;
pArgs = PyTuple_New(2);
// PyObject* Py_BuildValue(char *format, ...)
// 把C++的变量转换成一个Python对象。当需要从
// C++传递变量到Python时,就会使用这个函数。此函数
// 有点类似C的printf,但格式不同。常用的格式有
// s 表示字符串,
// i 表示整型变量,
// f 表示浮点数,
// O 表示一个Python对象。
PyTuple_SetItem(pArgs, 0, Py_BuildValue("s",pattern));
PyTuple_SetItem(pArgs, 1, Py_BuildValue("s",text));
// 调用Python函数
ret=PyObject_CallObject(pFunc, pArgs);
char * str_ret = PyString_AsString(ret);
printf("result:%s\n", str_ret);
Py_DECREF(pName);
Py_DECREF(pArgs);
Py_DECREF(pModule);
// 关闭Python
Py_Finalize();
gettimeofday(&tv2, NULL);
fprintf(stderr,"%s has finished congratulations!\n",argv[0]);
fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000);
return 0;
方案二代码:
// please add your code here!
#include <errno.h>
#include <iconv.h>
#include <locale.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include <wchar.h>

#include <iostream>
#include <set>
#include <string>
#include <vector>

#include <boost/regex.hpp>
using namespace std;
/*
funcname: PrintUsage
spec: print a one-line usage message to stderr
parms: none
returnValue: none
author liuyu, 20120528
*/
void PrintUsage()
{
    // main() rejects any command-line argument, reads its input from stdin
    // and writes the matches to stdout, so the usage line shows redirection
    // rather than file arguments.
    fprintf( stderr, "usage: prog < input_file > output_file\n" );
}
/*
funcname: toAnotherCode
spec: convert srclen bytes of srcstr from encoding fromCode to encoding
      toCode with iconv and copy the result into deststr
parms:[IN]  toCode, fromCode  iconv encoding names
      [IN]  srcstr, srclen    input bytes and their count
      [OUT] deststr           receives destlen converted bytes plus one
                              terminating 0 byte; the caller must provide
                              enough room
      [OUT] destlen           number of converted bytes written
returnValue: 0 on success, 1 on failure (on a conversion error the bytes
             converted so far are still copied to deststr)
*/
int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen)
{
    iconv_t convertor=iconv_open(toCode,fromCode);
    if(convertor==iconv_t(-1))
    {
        fprintf(stderr,"convertor device initailization failed!\n");
        return 1;
    }

    // iconv only advances these local copies; the caller's data is untouched.
    char *input=srcstr;
    size_t inputleft=srclen;

    // Never zero-sized, so empty input is safe; grown on demand below.
    std::vector<char> output(srclen*4+16);
    size_t produced=0;
    bool ok=true;

    for(;;)
    {
        char *outptr=&output[0]+produced;
        size_t outleft=output.size()-produced;
        size_t rc=iconv(convertor,&input,&inputleft,&outptr,&outleft);
        produced=output.size()-outleft;
        if(rc!=size_t(-1))
        {
            break;
        }
        if(errno==E2BIG)
        {
            // Output buffer exhausted: enlarge it and resume where iconv stopped.
            output.resize(output.size()*2);
            continue;
        }
        // Diagnostics go to stderr so they cannot mix with program output.
        fprintf(stderr, "errno=%d\n",errno);
        ok=false;
        break;
    }

    if(ok)
    {
        // Let stateful encodings emit their closing shift sequence.
        if(output.size()-produced<16)
        {
            output.resize(produced+16);
        }
        char *outptr=&output[0]+produced;
        size_t outleft=output.size()-produced;
        if(iconv(convertor,NULL,NULL,&outptr,&outleft)==size_t(-1))
        {
            fprintf(stderr, "errno=%d\n",errno);
            ok=false;
        }
        produced=output.size()-outleft;
    }

    destlen=produced;
    memcpy(deststr,&output[0],destlen);
    deststr[destlen]=0;

    iconv_close(convertor);
    return ok ? 0 : 1;
}
// Select a UTF-8 locale for the multibyte <-> wide conversions below.
// "zh_CN.UTF8" is tried first (the locale this file has always requested);
// the other names are fallbacks for machines where it is not installed.
// If none can be set, the current locale is left as it is.
static void SelectUtf8Locale()
{
    static const char *const candidates[] = { "zh_CN.UTF8", "zh_CN.UTF-8", "en_US.UTF-8", "C.UTF-8" };
    for (size_t i = 0; i < sizeof(candidates)/sizeof(candidates[0]); ++i)
    {
        if (setlocale(LC_ALL, candidates[i]) != NULL)
        {
            return;
        }
    }
}

// Convert a multibyte string to a newly allocated wide string.
// The caller owns the result and must release it with delete [].
// Returns an empty string (never NULL) when pszSrc is NULL or contains a
// byte sequence that is invalid in the active locale.
wchar_t * MBs2WCs(const char* pszSrc){
    SelectUtf8Locale();
    const size_t failed = static_cast<size_t>(-1);
    size_t size = failed;
    if (pszSrc != NULL)
    {
        size = mbstowcs(NULL,pszSrc,0);
    }
    if (size == failed)
    {
        // mbstowcs reports failure as (size_t)-1; it must not be used as a length.
        wchar_t* empty = new wchar_t[1];
        empty[0] = L'\0';
        return empty;
    }
    wchar_t* pwcs = new wchar_t[size+1];
    size_t written = mbstowcs(pwcs, pszSrc, size+1);
    if (written == failed)
    {
        written = 0;
    }
    pwcs[written] = L'\0';
    return pwcs;
}

// Convert a wide string to a newly allocated multibyte string.
// The caller owns the result and must release it with delete [].
// Returns an empty string (never NULL) when wcharStr is NULL or contains a
// character that cannot be represented in the active locale.
char* WCs2MBs(const wchar_t * wcharStr){
    SelectUtf8Locale();
    const size_t failed = static_cast<size_t>(-1);
    size_t size = failed;
    if (wcharStr != NULL)
    {
        size = wcstombs( NULL, wcharStr, 0);
    }
    if (size == failed)
    {
        char* empty = new char[1];
        empty[0] = '\0';
        return empty;
    }
    char* str = new char[size + 1];
    size_t written = wcstombs( str, wcharStr, size);
    if (written == failed)
    {
        written = 0;
    }
    str[written] = '\0';
    return str;
}
int main( int argc, char *argv[] )
{
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
if ( 1 != argc )
{
PrintUsage();
return 1;
}
/*
char *s="刘禹,刘德华,刘佳佳。。。王大虎。。。刘长春,xixi";
char *t="(刘[^刘]*?),";
wchar_t *ws =MBs2WCs(s);
wchar_t *wt =MBs2WCs(t);
wstring wstr1=ws;
wstring wstr2=wt;
boost::wregex wreg(wt,boost::regbase::icase|boost::regex::perl);
boost::wsmatch wm;
wstring::const_iterator it=wstr1.begin();
wstring::const_iterator end=wstr1.end();
while(boost::regex_search(it,end,wm,wreg))
{
wstring wtemp=wm[1];
char* temp=WCs2MBs(wtemp.c_str());
printf("%s\n",temp);
it=wm[0].second;
}
*/
char line[102400]={0};
char text[102400]={0};
char* t="<li><span>字义:</span>(.*?)</li>";
wchar_t *wt =MBs2WCs(t);
boost::wsmatch wm;
boost::wregex wreg(wt,boost::regbase::icase|boost::regex::perl);
while(fgets(line,102400,stdin))
{
strcat(text,line);
}
wchar_t * ws = MBs2WCs(text);
wstring wtext=ws;
wstring::const_iterator it=wtext.begin();
wstring::const_iterator end=wtext.end();
vector<string> results;
while(boost::regex_search(it,end,wm,wreg))
{
wstring wtemp=wm[1];
char* temp=WCs2MBs(wtemp.c_str());
results.push_back(temp);
it=wm[1].second;
}
for (vector<string>::iterator it = results.begin(); it!=results.end(); it++)
{
fprintf(stdout,"%s\n",(*it).c_str());
}
gettimeofday(&tv2, NULL);
fprintf(stderr,"%s has finished congratulations!\n",argv[0]);
fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000);
return 0;
}
方案一的编译方法:
g++ Python.cpp -o Python -I/usr/include/python2.5 -L/usr/lib/python2.5 -lpython2.5