目的
分析 jieba 是如何把一个句子切分成词的。
预备知识
- xxx
jieba api
jieba 提供三种分词方式,由 cut_all 和 HMM 两个参数决定使用哪一种:
# Pick the generator method used for cutting, based on the cut_all / HMM flags.
if cut_all:  # cut_all is truthy: select __cut_all
    cut_block = self.__cut_all
elif HMM:  # cut_all is falsy and HMM is truthy: select __cut_DAG
    cut_block = self.__cut_DAG
else:  # both flags are falsy: select __cut_DAG_NO_HMM
    cut_block = self.__cut_DAG_NO_HMM
流程
- 建立图(dag)
- 根据图来计算如何切分
图的原理
图是如何建立的
对每个起始位置 k,只有在词频表里且词频不为 0 的片段才会被记录,记录的是该片段结束位置的索引 i;如果一个都没有,就只记录 k 自己。所有起始位置的记录合在一起就是图。
def get_DAG(self, sentence):
    """Build the word graph (DAG) for ``sentence``.

    For each start index ``start`` the fragment ``sentence[start:end + 1]``
    is grown one character at a time for as long as it is a key of
    ``self.FREQ``.  Every ``end`` whose fragment has a truthy (non-zero)
    value in ``self.FREQ`` is recorded.  If nothing is recorded for a start
    index, the start index itself is stored so the single character can
    still be emitted on its own.

    Returns:
        dict[int, list[int]]: start index -> inclusive end indices.
    """
    self.check_initialized()  # must run before self.FREQ is read
    freq_table = self.FREQ  # dict[str, int] per the notes: the pre-loaded frequency table
    length = len(sentence)
    graph = {}
    for start in xrange(length):
        ends = []
        end = start
        piece = sentence[start]
        while end < length and piece in freq_table:
            # Keys with a falsy count extend the scan but are not recorded
            # (presumably prefix placeholders; confirm against the loader).
            if freq_table[piece]:
                ends.append(end)
            end += 1
            piece = sentence[start:end + 1]
        graph[start] = ends or [start]
    return graph
第一种分词逻辑(cut_all 为真时使用 __cut_all):直接遍历图,把图里记录的词逐个输出
def __cut_all(self, sentence):
    """Yield the words recorded in the DAG of ``sentence``.

    Iterates the (start, end-list) items of ``self.get_DAG(sentence)``.
    A start with exactly one end that lies past the last emitted end yields
    that single fragment; any other start yields every fragment whose end is
    strictly greater than the start.  Consecutive single fragments matched by
    ``re_eng`` are merged into one buffer and yielded together.

    NOTE(review): ``re_eng`` and ``iteritems`` are defined outside this
    snippet; ``re_eng`` presumably matches English letters/digits - confirm.
    """
    dag = self.get_DAG(sentence)
    old_j = -1  # end index of the most recently emitted fragment
    eng_scan = 0  # 1 while a run of re_eng matches is being buffered
    eng_buf = u''  # the buffered run
    for k, L in iteritems(dag):
        # The buffered run ends at the first character re_eng does not match.
        if eng_scan == 1 and not re_eng.match(sentence[k]):
            eng_scan = 0
            yield eng_buf
        if len(L) == 1 and k > old_j:
            # Only one candidate, and it starts after the last emitted end.
            word = sentence[k:L[0] + 1]
            if re_eng.match(word):
                if eng_scan == 0:
                    # Start a new buffered run with this fragment.
                    eng_scan = 1
                    eng_buf = word
                else:
                    eng_buf += word
            # Fragments that are part of a buffered run are not yielded here.
            if eng_scan == 0:
                yield word
            old_j = L[0]
        else:
            for j in L:
                # Skip the end equal to k (the single-character fragment).
                if j > k:
                    yield sentence[k:j + 1]
                    old_j = j
    # Flush a run that reaches the end of the sentence.
    if eng_scan == 1:
        yield eng_buf
第二种分词逻辑:HMM(HMM 为真时使用 __cut_DAG)
- 也是先做一个图
- 计算一条路径(route)。那么问题是:路径是干什么的?如何计算路径?从下面的代码看,route[x][1] 是从位置 x 开始切出的那个词的结束索引,切分时就沿着它逐词前进;计算方法见后面的 calc
def __cut_DAG(self, sentence):
    """Yield the words of ``sentence`` by following the route from ``calc``.

    ``route[x][1]`` is the inclusive end index of the word chosen at start
    ``x``.  Multi-character words are yielded directly.  Single characters
    are collected in ``buf``; when the run ends, a one-character buffer is
    yielded as is, a longer buffer with no truthy entry in ``self.FREQ`` is
    passed to ``finalseg.cut`` and its results are yielded, and otherwise
    the buffer is yielded character by character.

    NOTE(review): ``finalseg`` is defined outside this snippet; per the
    surrounding notes it is the HMM step - confirm.
    """
    DAG = self.get_DAG(sentence)
    route = {}
    self.calc(sentence, DAG, route)  # this is where the route is computed (fills `route` in place)
    x = 0
    buf = ''  # pending run of consecutive single-character pieces
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1  # exclusive end of the word chosen at x
        l_word = sentence[x:y]
        if y - x == 1:
            # Single character: defer it until the run ends.
            buf += l_word
        else:
            # A multi-character word ends the run; flush the buffer first.
            if buf:
                if len(buf) == 1:
                    yield buf
                    buf = ''
                else:
                    if not self.FREQ.get(buf):
                        # Buffer is missing from FREQ or has a zero count.
                        recognized = finalseg.cut(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield elem
                    buf = ''
            yield l_word
        x = y
    # Flush whatever is still buffered at the end of the sentence
    # (same rules as the in-loop flush).
    if buf:
        if len(buf) == 1:
            yield buf
        elif not self.FREQ.get(buf):
            recognized = finalseg.cut(buf)
            for t in recognized:
                yield t
        else:
            for elem in buf:
                yield elem
route 是如何计算的(calc)
def calc(self, sentence, DAG, route):
    """Fill ``route`` with the best-scoring cut for every start index.

    ``route`` is mutated in place; nothing is returned.  After the call,
    ``route[idx]`` is a ``(score, end)`` tuple: ``end`` is the inclusive end
    index of the word chosen at ``idx`` and ``score`` is the sum of
    ``log(count) - log(self.total)`` over the words from ``idx`` to the end
    of the sentence.  ``route[len(sentence)]`` is the ``(0, 0)`` terminator.

    Args:
        sentence: the text being cut.
        DAG: start index -> inclusive end indices, as built by ``get_DAG``.
        route: dict to fill, index -> (score, end index).
    """
    length = len(sentence)
    route[length] = (0, 0)
    log_total = log(self.total)
    # Walk backwards so route[end + 1] is already known for every candidate.
    for start in xrange(length - 1, -1, -1):
        candidates = []
        for end in DAG[start]:
            # A missing or zero count is treated as 1 so log() is defined.
            count = self.FREQ.get(sentence[start:end + 1]) or 1
            score = log(count) - log_total + route[end + 1][0]
            candidates.append((score, end))
        # Tuple comparison: highest score wins, ties go to the larger end.
        route[start] = max(candidates)