双向最大匹配法:将正向最大匹配法与逆向最大匹配法得到的分词结果进行比较,按照最大匹配原则,选取切分词数最少的结果作为最终结果。
Python代码如下:
#逆向匹配
class RMM:
    """Reverse (right-to-left) maximum-matching word segmenter.

    The segmenter scans the text from its end towards its start and, at each
    position, takes the longest dictionary word that ends there.
    """

    def __init__(self, dic_path):
        """Load the word dictionary.

        Args:
            dic_path: Path to a UTF-8 text file holding one word per line.
                Surrounding whitespace is stripped and blank lines are
                skipped.
        """
        # A set keeps the vocabulary free of duplicates and makes the
        # membership tests in cut() cheap.
        self.dictionary = set()
        # Length of the longest dictionary word; it bounds the match window.
        self.maximum = 0
        with open(dic_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                self.dictionary.add(line)
                if len(line) > self.maximum:
                    self.maximum = len(line)

    def cut(self, text):
        """Segment *text* by reverse maximum matching.

        Args:
            text: The string to segment.

        Returns:
            The tokens in reading order. Concatenating them reproduces
            *text*: a character that is not the end of any multi-character
            dictionary word becomes a single-character token instead of
            being dropped.
        """
        result = []
        index = len(text)  # exclusive right edge of the unsegmented prefix
        while index > 0:
            # Try the longest candidate first; the window can never extend
            # past the start of the text.
            for size in range(min(self.maximum, index), 1, -1):
                piece = text[index - size:index]
                if piece in self.dictionary:
                    break
            else:
                # No multi-character word ends here: fall back to one
                # character so that nothing is lost and the scan advances.
                piece = text[index - 1:index]
            result.append(piece)
            # Move the cursor instead of truncating the text, so slice
            # offsets always refer to the same string.
            index -= len(piece)
        # Tokens were collected right-to-left; restore reading order.
        return result[::-1]
# 正向匹配
class LMM:
    """Forward (left-to-right) maximum-matching word segmenter.

    The segmenter scans the text from its start towards its end and, at each
    position, takes the longest dictionary word that begins there.
    """

    def __init__(self, dic_path):
        """Load the word dictionary.

        Args:
            dic_path: Path to a UTF-8 text file holding one word per line.
                Surrounding whitespace is stripped and blank lines are
                skipped.
        """
        # A set keeps the vocabulary free of duplicates and makes the
        # membership tests in cut() cheap.
        self.dictionary = set()
        # Length of the longest dictionary word; it bounds the match window.
        self.maximum = 0
        with open(dic_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                self.dictionary.add(line)
                if len(line) > self.maximum:
                    self.maximum = len(line)

    def cut(self, text):
        """Segment *text* by forward maximum matching.

        Args:
            text: The string to segment.

        Returns:
            The tokens in reading order. Concatenating them reproduces
            *text*: a character that is not the start of any multi-character
            dictionary word becomes a single-character token instead of
            being dropped.
        """
        result = []
        length = len(text)
        index = 0  # left edge of the still-unsegmented suffix
        while index < length:
            # Try the longest candidate first; the window can never extend
            # past the end of the text.
            for size in range(min(self.maximum, length - index), 1, -1):
                piece = text[index:index + size]
                if piece in self.dictionary:
                    break
            else:
                # No multi-character word starts here: fall back to one
                # character so that nothing is lost and the scan advances.
                piece = text[index:index + 1]
            result.append(piece)
            # Move the cursor instead of re-slicing the text, so the tail of
            # the input is never discarded.
            index += len(piece)
        return result
def main():
    """Segment a sample sentence in both directions and print the result.

    The sentence is cut with the reverse matcher and with the forward
    matcher. The segmentation with fewer tokens is printed; when both have
    the same number of tokens the reverse-matching result is used.
    """
    sentence = '南京市长江大桥'
    backward_segmenter = RMM('../chinese_word.txt')
    forward_segmenter = LMM('../chinese_word.txt')
    backward_tokens = backward_segmenter.cut(sentence)
    forward_tokens = forward_segmenter.cut(sentence)

    # Fewer tokens wins; a tie keeps the reverse-matching segmentation.
    if len(backward_tokens) > len(forward_tokens):
        chosen = forward_tokens
    else:
        chosen = backward_tokens
    print(chosen)


if __name__ == '__main__':
    main()