Python基于词典的正向最大匹配

最新推荐文章于 2021-02-12 07:38:20 发布

Sirius2011071417

最新推荐文章于 2021-02-12 07:38:20 发布

阅读量3.4k

点赞数

文章标签： Python 分词正向最大匹配

本文链接：https://blog.csdn.net/Sirius2011071417/article/details/51180106

版权

分词的好坏取决于词典

'''
基于词典的正向最大匹配算法
@author Sirius
'''
import re

#载入汪峰的《存在》歌词
def data():
    """Load the lyrics in ``存在.txt`` and split them into sentences.

    Runs of two or more newlines are collapsed to a single newline, then the
    text is split on the delimiters ``，``, ``。``, ``《``, ``》`` and newline.

    Returns:
        list: The text pieces between delimiters, in file order. Empty
        strings can still occur (for example between a ``。`` and the
        newline right after it).
    """
    with open('存在.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    # Split the collapsed text: the earlier version built the collapsed copy
    # but then split the raw text, so blank lines were never removed.
    collapsed = re.sub(r'\n\n+', '\n', text)
    return re.split(r'[，。《》\n]', collapsed)

#载入词典

def load_dic():
    """Read the segmentation dictionary from ``dic.txt``, one word per line.

    Leading/trailing whitespace is stripped from every line and blank lines
    are skipped, so a trailing newline at the end of the file does not add
    an empty-string entry to the dictionary.

    Returns:
        list: The dictionary words, in file order.
    """
    with open('dic.txt', 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [entry for entry in stripped_lines if entry]

#正向最大匹配算法
def FMM(sents, MaxLen, dic=None):
    """Segment ``sents`` with dictionary-based forward maximum matching.

    Starting at the left, take up to ``MaxLen`` characters and drop the last
    character until the candidate is a dictionary word or only one character
    remains; emit that piece and continue with the rest of the string.

    Args:
        sents: The string to segment.
        MaxLen: Maximum candidate length in characters; must be >= 1.
        dic: Optional iterable of dictionary words. When omitted, the
            dictionary is loaded once via ``load_dic()``.

    Returns:
        str: The pieces in order, each followed by a backslash. An empty
        input gives an empty string.

    Raises:
        ValueError: If ``sents`` is non-empty and ``MaxLen`` is less than 1
            (no progress could be made through the input).
    """
    if not sents:
        return ''
    if MaxLen < 1:
        raise ValueError('MaxLen must be at least 1')

    # Load the dictionary once, not once per emitted word, and use a set so
    # each membership test is O(1).
    if dic is None:
        dic = load_dic()
    words = set(dic)

    pieces = []
    rest = sents
    while rest:
        word = rest[:MaxLen]  # slicing already caps at len(rest)
        # Stop at one character even when it is not in the dictionary;
        # shrinking further would yield '' and never consume any input.
        while len(word) > 1 and word not in words:
            word = word[:-1]
        pieces.append(word)
        rest = rest[len(word):]
    return ''.join(piece + '\\' for piece in pieces)


if __name__ == '__main__':
    # Demo: segment one sample sentence with a 4-character window.
    segmented = FMM('研究生命的起源', 4)
    print(segmented)

    # Disabled alternative: segment every sentence returned by data() and
    # print the non-empty results joined by commas.
    #
    #     result = ''
    #     for sent in data():
    #         s2 = FMM(sent, 4)
    #         if s2 != '':
    #             result = result + s2 + ','
    #     print(result[:len(result)-1])