1 正向最大匹配法FMM
2 反向最大匹配法BMM
3 双向最大匹配法
4 python实现代码
5 参考文献
user_dict = ['我们', '在', '在野', '生动', '野生', '动物园', '野生动物园', '物','园','玩']
sentence = '我们在野生动物园玩'
def FMM(user_dict, sentence):
"""
Author: t4ngw
param user_dict: 用户分词词典
param sentence: 需分词句子
"""
result = []
max_length = max([len(item) for item in user_dict])
start = 0
while start != len(sentence):
index = start + max_length
if index > len(sentence):
index = len(sentence)
for i in range(index, start, -1):
if (sentence[start:i] in user_dict) or (len(sentence[start:i]) == 1):
result.append(sentence[start:i])
break
start = i
return result
def BMM(user_dict, sentence):
"""
Author: t4ngw
param user_dict: 用户分词词典
param sentence: 需分词句子
"""
result = []
max_length = max([len(item) for item in user_dict])
start = len(sentence)
while start != 0:
index = start - max_length
if index < 0:
index = 0
for i in range(index, start):
if (sentence[i:start] in user_dict) or (len(sentence[i:start]) == 1):
result.append(sentence[i:start])
break
start = i
return result
def Twoway_maximum_match(user_dict, sentence):
"""
Author: t4ngw
param user_dict: 用户分词词典
param sentence: 需分词句子
"""
FMM_ = FMM(user_dict, sentence)
BMM_ = BMM(user_dict, sentence)
if (len(FMM_)) != (len(BMM_)):
if (len(FMM_)) <= (len(BMM_)):
return FMM_
else:
return BMM_
else:
FMM_single = 0
BMM_single = 0
for i in range(len(FMM_)):
if len(FMM_[i]) == 1:
FMM_single += 1
for j in range(len(BMM_)):
if len(FMM_[i]) == 1:
BMM_single += 1
if FMM_single > BMM_single:
return BMM_
else:
return FMM_
Twoway_maximum_match(user_dict, sentence)
out[1]: ['玩', '野生动物园', '在', '我们']