def fmm(sentence, file_name='wordSet', max_length=5, encoding=None):
    """Segment a sentence with the forward maximum matching (FMM) method.

    At each position the longest candidate (up to ``max_length`` characters)
    is tried first and shortened one character at a time until it is found
    in the dictionary. A single character is always emitted as a token, even
    if it is not in the dictionary.

    Args:
        sentence: The raw sentence to segment. Leading/trailing newline
            characters are removed before segmentation.
        file_name: Path of the word-frequency file. Its first line (the
            total word count) is skipped; every other line is expected to
            look like ``word: count``. Lines without the ``": "`` separator
            (e.g. blank lines) are ignored.
        max_length: Maximum candidate word length in characters.
        encoding: Text encoding used to read ``file_name``. ``None`` keeps
            the platform default, as ``open()`` does.

    Returns:
        The segmented sentence as one string in which every token is
        followed by ``'/'`` (so a non-empty result ends with ``'/'``).

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
        OSError: If the dictionary file cannot be opened.
    """
    if max_length < 1:
        # A zero-length candidate would never consume input (infinite loop).
        raise ValueError("max_length must be at least 1")

    # Only membership is needed for matching, so the counts are not kept.
    words = set()
    with open(file_name, "r", encoding=encoding) as dict_file:
        next(dict_file, None)  # skip the header line holding the total count
        for line in dict_file:
            word, separator, _count = line.partition(": ")
            if separator:
                words.add(word)

    sentence = sentence.strip("\n")
    total = len(sentence)
    tokens = []
    start = 0
    while start < total:
        # Start from the longest candidate and shrink until it is a known
        # word or only one character is left.
        end = min(start + max_length, total)
        while end - start > 1 and sentence[start:end] not in words:
            end -= 1
        tokens.append(sentence[start:end])
        start = end
    return "".join(token + "/" for token in tokens)
# Demo: segment a sample sentence with the dictionary stored in wordSet.txt
# and print the result.
sentence = "欧盟决定罚巨款大众不服打官司"
segmented = fmm(sentence, 'wordSet.txt')
print(segmented)
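# NOTE(review): the following stray values are not Python code; they appear
# to be residue from the page this snippet was copied from:
# 05-06 / 4万+ / 12-26 / 1784 / 01-20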