2021SC@SDUSC
cut方法
def cut(self, sentence, HMM=True):
    """Lazily yield every item produced by ``__cut_internal`` for ``sentence``.

    Args:
        sentence: The text to segment; forwarded unchanged.
        HMM: Forwarded as the ``HMM`` keyword of ``__cut_internal``.

    Yields:
        Each item of the internal generator, in order and unmodified.
    """
    tagged_items = self.__cut_internal(sentence, HMM=HMM)
    for item in tagged_items:
        yield item
cut方法只是对__cut_internal(sentence, HMM=True)的封装,下面来看__cut_internal的实现:
def __cut_internal(self, sentence, HMM=True):
    """Split ``sentence`` into blocks and yield the tagged pieces of each block.

    Blocks matched by ``re_han_internal`` are handed to the DAG cutter
    (``__cut_DAG`` when ``HMM`` is true, ``__cut_DAG_NO_HMM`` otherwise).
    Every other block is split again with ``re_skip_internal`` and tagged
    here: fragments matching ``re_skip_internal`` become ``pair(x, 'x')``;
    the remaining fragments are tagged character by character as ``'m'``
    (``re_num``), ``'eng'`` (``re_eng``) or ``'x'``.

    Args:
        sentence: Text to segment; passed through ``strdecode`` first.
        HMM: Selects which DAG cutter is used for matching blocks.

    Yields:
        Whatever the selected DAG cutter yields for matching blocks, and
        ``pair`` objects for everything else.
    """
    # Merge any pending user-defined dictionary entries into the default table.
    self.makesure_userdict_loaded()
    sentence = strdecode(sentence)
    # Split the text into blocks.
    blocks = re_han_internal.split(sentence)
    if HMM:
        # Find the most probable path with the help of the HMM model.
        cut_blk = self.__cut_DAG
    else:
        # Do not use the HMM model.
        cut_blk = self.__cut_DAG_NO_HMM
    for blk in blocks:
        if re_han_internal.match(blk):
            # Matching blocks are segmented by the selected DAG cutter.
            for word in cut_blk(blk):
                yield word
        else:
            # Non-matching blocks are split once more.
            tmp = re_skip_internal.split(blk)
            for x in tmp:
                if re_skip_internal.match(x):
                    # The fragment is emitted whole, tagged 'x'.
                    yield pair(x, 'x')
                else:
                    # Still unmatched: classify every character on its own.
                    for xx in x:
                        if re_num.match(xx):
                            # Numeric character.
                            yield pair(xx, 'm')
                        # Test the character itself, not the enclosing
                        # fragment ``x``; matching on ``x`` would tag every
                        # non-numeric character 'eng' whenever the fragment
                        # merely starts with an ``re_eng`` match.
                        elif re_eng.match(xx):
                            yield pair(xx, 'eng')
                        else:
                            # Neither pattern matched.
                            yield pair(xx, 'x')
pair类:
class pair(object):
    """Couple a word with the flag (part-of-speech tag) assigned to it."""

    def __init__(self, word, flag):
        """Store ``word`` and ``flag`` as same-named instance attributes."""
        self.word, self.flag = word, flag
makesure_userdict_loaded()方法:
def makesure_userdict_loaded(self):
    """Fold pending user-dictionary tags into ``self.word_tag_tab``.

    If the tokenizer's ``user_word_tag_tab`` is non-empty, its entries are
    merged into ``self.word_tag_tab`` and the tokenizer's table is then
    replaced by a fresh empty dict. Otherwise nothing happens.
    """
    pending = self.tokenizer.user_word_tag_tab
    if not pending:
        return
    self.word_tag_tab.update(pending)
    self.tokenizer.user_word_tag_tab = {}