# The quality of the segmentation depends on the quality of the dictionary.
'''
Dictionary-based forward maximum matching (FMM) word segmentation.
@author Sirius
'''
import re
def data():
    """Load the lyrics of Wang Feng's song from '存在.txt' as sentences.

    Runs of consecutive newlines (blank lines) are first collapsed into a
    single newline; the text is then split on the delimiter characters
    listed in the split pattern below (punctuation marks and newline).

    Returns:
        list[str]: the text pieces between delimiters, in file order.
        Empty strings can still appear where two delimiters are adjacent
        or where the text starts/ends with a delimiter.

    Raises:
        OSError: if '存在.txt' cannot be opened or read.
    """
    with open('存在.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    # Collapse blank-line runs so they do not produce empty sentences.
    collapsed = re.sub(r'\n\n+', '\n', text)
    # Split the *collapsed* text: splitting the raw text would throw away
    # the normalisation done on the previous line.
    return re.split(r'[,。《》\n]', collapsed)
def load_dic():
    """Read the word dictionary from 'dic.txt' (UTF-8).

    The file content is split on newline characters only, so every line
    becomes one entry and a trailing newline in the file yields a final
    empty-string entry.

    Returns:
        list[str]: the dictionary entries in file order.
    """
    with open('dic.txt', mode='r', encoding='utf-8') as dic_file:
        entries = dic_file.read().split('\n')
    return entries
def FMM(sents, MaxLen, dic=None):
    """Segment a sentence with forward maximum matching (FMM).

    Starting at the left edge, the longest prefix of at most ``MaxLen``
    characters that is found in the dictionary is taken as the next word.
    If no prefix matches, the single leading character is emitted as a
    word on its own, so the scan always advances.

    Args:
        sents: the sentence (str) to segment.
        MaxLen: maximum word length, in characters, to try at each step.
        dic: optional collection of dictionary words. When omitted, the
            dictionary is loaded once via ``load_dic()``.

    Returns:
        str: the words in order, each one followed by a backslash.
        An empty input yields an empty string.

    Raises:
        ValueError: if ``MaxLen`` is smaller than 1 for a non-empty input
            (no progress would be possible).
    """
    if sents == '':
        return ''
    if MaxLen < 1:
        raise ValueError('MaxLen must be at least 1')
    if dic is None:
        # Load the dictionary once per call instead of once per word.
        dic = load_dic()
    # Hash-based membership keeps each lookup O(1).
    words = dic if isinstance(dic, (set, frozenset)) else frozenset(dic)

    pieces = []
    rest = sents
    while rest:
        # Slicing clamps to the remaining length automatically.
        word = rest[:MaxLen]
        # Shrink from the right until a dictionary hit or one character
        # is left. Checking the length *before* shrinking guarantees the
        # candidate never becomes empty, which previously caused an
        # endless loop for unknown single characters.
        while len(word) > 1 and word not in words:
            word = word[:-1]
        pieces.append(word + '\\')
        rest = rest[len(word):]
    return ''.join(pieces)
if __name__ == '__main__':
    # Demo: segment one sample sentence, trying words of up to 4 characters.
    sample = '研究生命的起源'
    print(FMM(sample, 4))
    # Disabled batch run over the lyric sentences, kept for reference:
    # result = ''
    # for sent in data():
    #     s2 = FMM(sent, 4)
    #     if s2 != '':
    #         result = result + s2 + ','
    # print(result[:len(result)-1])