以前用C++实现的RMM分词类型

这里实现的是中文分词中常用的RMM(逆向最大匹配)算法。不说那么多了,我不是研究算法的,总之RMM这种基于词典的分词算法,准确率比较高就是了。我实现的这个方法,会把非词典的内容全部拆分成单字。代码写了很多年了,细节忘记了很多,直接贴代码:

#include <map>
#include <vector>
#include <list>
#include <string>
#include <algorithm>
#include "tools.h"
using namespace std;


#ifndef PARTH
#define PARTH

// Number of elements in the array `x` (must be a real array, not a pointer).
// The whole expansion and the argument are parenthesized so the macro can be
// used safely inside larger expressions such as `n / _ARRAY_COUNT_(a)`.
#define _ARRAY_COUNT_(x) (sizeof(x)/sizeof(*(x)))
// Sets the C locale to "chs", prints the wide string `c` with wprintf, then
// writes a newline through cout. Expands to several statements, so it must
// not be used as the sole body of an unbraced if/else/loop.
#define _WPRINTF_(c) setlocale(LC_ALL,"chs"); wprintf( c ); cout << endl;
// Shorthand for a vector of wide strings.
#define _WVECTOR_ std::vector<std::wstring>

// Copies the first `arr_len` elements of `arr` into `vec`, replacing its contents.
void CreateVectorForArray(const std::wstring *arr, const int arr_len, std::vector<std::wstring> &vec);
// Replaces occurrences of `find` in `str` with `replace`, in place.
void StrReplace(std::wstring &str, const std::wstring find, const std::wstring replace);

// Fills `target` from the array `src`. The expansion already ends with ';'
// and callers in this file invoke it without one, so the ';' must stay.
#define _WCreateVectorForArray_(src,target) CreateVectorForArray(src,_ARRAY_COUNT_(src),target);
// Forwards to WInVectory, which is not defined in this part of the file
// (presumably provided by "tools.h" -- TODO confirm).
#define _WInVectory(str, vec) WInVectory(str,vec)

/// Replaces the contents of `vec` with copies of the first `arr_len`
/// elements of `arr`.
///
/// @param arr      Pointer to the first element of the source array.
/// @param arr_len  Number of elements to copy.
/// @param vec      Destination vector; any previous contents are discarded.
///
/// A null `arr` or a non-positive `arr_len` leaves `vec` empty instead of
/// forming an invalid iterator range.
void CreateVectorForArray(const std::wstring *arr, const int arr_len, std::vector<std::wstring> &vec){
    if( arr == 0 || arr_len <= 0 ){
        vec.clear();
        return;
    }
    // assign() copies straight from the array; no intermediate vector needed.
    vec.assign( arr, arr + arr_len );
}


/// Replaces every non-overlapping occurrence of `find` in `str` with
/// `replace`, scanning left to right and modifying `str` in place.
///
/// @param str      String to modify.
/// @param find     Substring to search for; an empty pattern is a no-op.
/// @param replace  Text substituted for each occurrence (may be empty).
void StrReplace(std::wstring &str, const std::wstring find, const std::wstring replace){
    // An empty pattern matches at every position and would never terminate.
    if( find.empty() )
        return;

    std::wstring::size_type pos = 0;
    while( (pos = str.find(find, pos)) != std::wstring::npos ){
        str.replace( pos, find.size(), replace );
        // Resume right after the inserted text: this neither skips a match
        // that slid into `pos` (empty replacement) nor re-matches text that
        // was just inserted (replacement containing the pattern).
        pos += replace.size();
    }
}

 
/// Statistics recorded for one distinct token produced by WordPart::Part().
/// Every field starts at zero.
typedef struct WordPartResultItem{
    int count;        // number of times the token was reported
    int startOffset;  // offset stored when the token was first reported
    int endOffset;    // offset stored by the most recent report
    int sort;         // sequence number handed out on the first report
    float scale;      // zero-initialized; not written by the code in this file section

    // Initializers are listed in declaration order.
    WordPartResultItem()
        : count(0),
          startOffset(0),
          endOffset(0),
          sort(0),
          scale(0)
    {}
} _WordPartResultItem;



// Alias used as the mapped type of the segmentation result.
typedef WordPartResultItem _WordPartResult;



class WordPart{
public:
WordPart( wstring word[], int word_len);
void SetContent( wstring content);

//分词时是否允许将每个单字分割词
bool is_single;

//执行分词
map<wstring,_WordPartResult> Part(); 

private:

//用于分词的词典
map<int,list<wstring>> words;
//词典总数
int words_length;

//将要分词的内容
wstring content;

//最大匹配词的长度
int max_length;

//忽略的字符
_WVECTOR_ trim_str;
//断句符号
_WVECTOR_ end_dot;

//获取最大匹配词的长度
int GetMaxLength(wstring *words);

void setResult( map<wstring,_WordPartResult> &result, int &sort, const wstring k, const int offset );

};

#endif // PARTH


/// Builds the segmenter from a dictionary array.
///
/// @param word      Array of dictionary entries.
/// @param word_len  Number of entries in `word`.
///
/// Single-character mode is switched off, the ignored-character and
/// sentence-delimiter tables are filled, and the entries are grouped by
/// length for lookup.
WordPart::WordPart( std::wstring word[], int word_len){
    words_length = word_len;
    is_single = false;
    // GetMaxLength() reads words_length, so it has to run after the assignment above.
    max_length = GetMaxLength(word);

    // Characters replaced by a space in SetContent().
    // NOTE(review): "(" and ")" each appear twice; one pair may originally
    // have been full-width characters -- verify against the original source.
    std::wstring trim_tmp[] = {L"\"",L"'",L"‘",L"’",L"“",L"”",L"\\",L"(",L")",L"(",L")"};
    // Characters that terminate a sentence segment in Part().
    // NOTE(review): "," and ":" are likewise duplicated -- verify.
    std::wstring end_tmp[] = {L".",L",",L"。",L",",L":",L":",L" "};
    CreateVectorForArray(trim_tmp, _ARRAY_COUNT_(trim_tmp), trim_str);
    CreateVectorForArray(end_tmp, _ARRAY_COUNT_(end_tmp), end_dot);

    // Group the dictionary by entry length; operator[] creates the bucket on first use.
    int idx = 0;
    while( idx < word_len ){
        const std::wstring &entry = word[idx];
        const int bucket = entry.size();
        words[bucket].push_back(entry);
        ++idx;
    }
}

void WordPart::SetContent( wstring content){
_WVECTOR_::iterator iter;
for( iter=trim_str.begin(); iter!=trim_str.end(); iter++){
StrReplace(content,*iter,L" ");
}
this->content = content;
}

int WordPart::GetMaxLength(wstring *words){
int size = 0;
for( int i=0; i<words_length; i++ ){
if( words[i].length() > size )
size = words[i].length();
}

return size;
}


map<wstring,_WordPartResult> WordPart::Part(){

map<wstring,_WordPartResult> result;
if( content.empty() || words_length == 0 )
return result;
//如果要求切分单字
if( is_single ){
wstring cstr;
wstring eng;
int sort = 0;
for( int i=0; i<content.size(); i++ ){
cstr = content.substr(i,1);
if( (cstr < L"a" || cstr > L"z") && (cstr < L"A" || cstr > L"Z") && (cstr < L"0" || cstr > L"9") ){
if(!eng.empty()){
if( words.find(eng.size()) == words.end() 
|| find( words[eng.size()].begin(), words[eng.size()].end(), eng ) == words[eng.size()].end() )
setResult(result,sort,eng,i-eng.size());
eng.clear();
}
if( words.find(1) == words.end() 
|| find( words[1].begin(), words[1].end(), cstr ) == words[1].end() )
setResult(result,sort,cstr,i);

}else{
eng.append(cstr);
}
}
if(!eng.empty()){
if( words.find(eng.size()) == words.end() 
|| find( words[eng.size()].begin(), words[eng.size()].end(), eng ) == words[eng.size()].end() )
setResult(result,sort,eng,content.size()-eng.size());
eng.clear();
}

}

_WVECTOR_ line;
wstring one_line = L"";

//段句
for( int i=0; i<content.size(); i++ ){
wstring str = content.substr( i, 1 );
if( find(end_dot.begin(),end_dot.end(),str) == end_dot.end() ){
one_line.append(str);
}else{
line.insert(line.begin(),one_line);
one_line.clear();
}
if( i==content.size() - 1 && !one_line.empty() ){
line.insert(line.begin(),one_line);
}
}
int content_size = content.size(), offset = content_size, sort = 0;

for(_WVECTOR_::iterator i=line.begin(); i != line.end(); i++){
offset -= (*(i)).size();
if( i != line.begin() ){
offset -= 1; //断句符号位置
}
//分词
wstring str = *i; //段内容
wstring cstr; //分段内容
int begin = 0,pointer; //游标,一个全局游标,和一个相对游标
int k; //词典键
int sublength = max_length; //每个查询字符的长度
if( str.size() > max_length ){
begin = str.size() - max_length;
}else{
sublength = str.size();
}
while (begin != -sublength)
{
pointer = 0;
while( pointer < sublength ){
if( begin >= 0 ){
cstr = str.substr( begin, sublength-pointer );
k = cstr.size();
if( words.find(k) != words.end() && find(words[k].begin(),words[k].end(),cstr)!=words[k].end() ){
setResult(result,sort,cstr,offset+begin);
break;
}
}
pointer++;
if( pointer < sublength )
begin++;
}
begin-=sublength;
}
}
return result;
}


void WordPart::setResult( map<wstring,_WordPartResult> &result, int &sort, const wstring k, const int offset ){
if( result.find(k) == result.end() ){
_WordPartResultItem row;
result.insert( map<wstring,_WordPartResult>::value_type(k,row) );
result[k].startOffset = offset;
result[k].sort = sort;
sort++;
}
result[k].count += 1;
result[k].endOffset = offset;
}  
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值