Python实现逆向最大匹配中文分词。需要注意的是,作为词典的文件必须使用UTF-8编码。
# -*- coding: utf-8 -*-
# BMM
# 使用逆向最大匹配算法实现中文分词
# Dictionary words loaded by init(); shared by the lookup helpers in this file.
dic = []


def init(path="test.txt"):
    """Load the segmentation dictionary into the module-level ``dic`` list.

    The file is read as UTF-8 text. For every line, only the text before the
    first comma is kept as a dictionary word; the rest of the line is ignored.
    Surrounding whitespace (including the line terminator) is stripped and
    blank lines are skipped, so entries without a comma are stored cleanly.

    Words are appended to ``dic``; existing entries are left in place.

    :param path: location of the dictionary file (defaults to ``test.txt``)
    :return: None
    :raises IOError: if the file cannot be opened (``OSError`` on Python 3)
    :raises UnicodeDecodeError: if the file is not valid UTF-8
    """
    # io.open decodes text the same way on Python 2 and Python 3, unlike the
    # builtin open() followed by a manual .decode() on each line.
    import io

    # "utf-8-sig" also accepts plain UTF-8 and drops a leading BOM, which
    # would otherwise be glued onto the first word.
    with io.open(path, encoding="utf-8-sig") as dict_file:
        for line in dict_file:
            word = line.split(",")[0].strip()
            if word:
                dic.append(word)
def if_contain(words):
    """Report whether ``words`` is an entry of the module-level ``dic`` list.

    :param words: candidate word, either decoded text or UTF-8 encoded bytes
    :return: True if the word equals some dictionary entry, otherwise False
    :raises UnicodeDecodeError: if ``words`` is bytes that are not valid UTF-8
    """
    # Only byte strings need decoding. Calling .decode() on text fails on
    # Python 3 (no such method) and on Python 2 for non-ASCII unicode.
    if isinstance(words, bytes):
        words = words.decode("utf8")
    return words in dic
def spl(sentence):
"""
逆向最大匹配算法的主要实现部分
从后向前切割字符串,直到切割出的子串与词典中的词匹配
:param sentence:
:return: