Since my skills are limited, I have simplified the code of the word-segmentation module in the jieba library; everything here is implemented with plain functions.
import jieba
from jieba.finalseg.prob_emit import P as emit_p
from jieba.finalseg.prob_start import P as start_p
from jieba.finalseg.prob_trans import P as trans_p
import os
from math import log
import re
init_dir = jieba.__file__
jieba_dir = os.path.dirname(init_dir)
dict_dir = os.path.join(jieba_dir, "dict.txt")  # os.path.join keeps the path portable across OSes
re_eng = re.compile("[a-zA-Z0-9]", re.U)  # a single letter or digit
re_han_default = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#\._]+)", re.U)  # Han characters, letters, digits and +#._
re_skip_default = re.compile("(\r\n|\s)", re.U)  # line breaks or whitespace
re_han_cut_all = re.compile("([\u4E00-\u9FA5]+)", re.U)  # full mode: Han characters only
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)  # anything that is not a letter, digit, +, # or newline
re_skip_vit = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")  # numbers (including decimals) or alphanumeric runs
route = {}  # global route table, filled in by calc()
text_type = str
MIN_FLOAT = -3.14e100  # log-probability floor, standing in for log(0)
states = "BMES"
PrevStatus = {
'B': 'ES',
'M': 'MB',
'S': 'SE',
'E': 'BM'
}
def strdecode(sentence):
    if not isinstance(sentence, text_type):  # not a str yet, i.e. raw bytes
        try:
            sentence = sentence.decode('utf-8')  # decode UTF-8 bytes to str
        except UnicodeDecodeError:  # fall back to GBK on a UnicodeDecodeError
            sentence = sentence.decode('gbk', 'ignore')  # 'ignore' silently drops illegal characters
    return sentence
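As a quick check, the same text comes back as an identical str whether it arrives as a str or as UTF-8/GBK bytes (assuming the bytes really are UTF-8 or GBK):
print(strdecode("北京"))                  # already a str, returned unchanged
print(strdecode("北京".encode("utf-8")))  # UTF-8 bytes -> '北京'
print(strdecode("北京".encode("gbk")))    # GBK bytes fail UTF-8 decoding, fall back to GBK -> '北京'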
def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]  # V[t][y]: best log-probability of any tag path ending in state y at position t
    path = {}
    for y in states:  # initialise with the start and emission probabilities of the first character
        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
        path[y] = [y]  # remember the path taken
    for t in range(1, len(obs)):
        V.append({})
        newpath = {}
        for y in states:
            em_p = emit_p[y].get(obs[t], MIN_FLOAT)
            # best log-probability of being in state y at time t: pick the predecessor y0 at t-1
            # that maximises V[t-1][y0] + trans_p[y0][y] + emit_p[y][obs[t]]
            (prob, state) = max([(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
            V[t][y] = prob
            newpath[y] = path[state] + [y]  # keep only the most probable path into y
        path = newpath
    # the last character can only be in two states: E (end of a word) or S (single-character word);
    # pick whichever is more probable
    (prob, state) = max((V[len(obs) - 1][y], y) for y in 'ES')
    return (prob, path[state])
def get_voc(vit, sentence):  # convert the BMES tags back into words
    prob, pos_list = vit
voc = []
begin, nexti = 0, 0
for i, char in enumerate(sentence):
pos = pos_list[i]
if pos == "B":
begin = i
elif pos == "E":
voc.append(sentence[begin: i + 1])
nexti = i + 1
elif pos == "S":
voc.append(char)
nexti = i + 1
if nexti < len(sentence):
voc.append(sentence[nexti:])
return voc
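A hand-made example of the conversion (the tag list here is written by hand rather than produced by viterbi, purely to show how the tags turn into words):
print(get_voc((0.0, ['B', 'E', 'B', 'E', 'S']), "今天天气好"))  # ['今天', '天气', '好']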
def resolve_filename(f):
try:
return f.name
except AttributeError:
return repr(f)
def get_dict_file(file):
return open(file, "rb")
def gen_pfdict(file):
lfreq = {}
ltotal = 0
f = get_dict_file(file)
    f_name = resolve_filename(f)  # a readable name for the file object, used in the error message below
for lineno, line in enumerate(f, 1):
try:
line = line.strip().decode("utf-8")
word, freq = line.split(" ")[: 2]
freq = int(freq)
lfreq[word] = freq
ltotal += freq
            for ch in range(len(word)):  # also register every prefix of the word
                wfrag = word[: ch + 1]
                if wfrag not in lfreq:
                    lfreq[wfrag] = 0  # 0 marks "prefix of a word, not a word itself"; get_DAG relies on this
except ValueError:
raise ValueError("invalid dictionary entry in %s at line %s: %s" % (f_name, lineno, line))
f.close()
return lfreq, ltotal
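To see the prefix entries, feed gen_pfdict a two-line toy dictionary; the file and its contents below are made up purely for illustration:
import tempfile
with tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False) as tmp:
    tmp.write("北京 100\n欢迎 50\n".encode("utf-8"))
toy_freq, toy_total = gen_pfdict(tmp.name)
print(toy_freq)   # {'北京': 100, '北': 0, '欢迎': 50, '欢': 0}
print(toy_total)  # 150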
def get_DAG(sentence, freq, total):
    DAG = {}
    N = len(sentence)
    for k in range(N):
        tmplist = []
        i = k
        frag = sentence[k]
        while i < N and frag in freq:  # extend the fragment while it is still a known prefix
            if freq[frag]:  # frequency > 0 means the fragment is a real word
                tmplist.append(i)
            i += 1
            frag = sentence[k: i + 1]
        if not tmplist:  # no word starts here; fall back to the single character
            tmplist.append(k)
        DAG[k] = tmplist  # DAG[k]: end positions of all candidate words starting at k
    return DAG
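With the toy frequencies written out as a literal, the DAG for "北京欢迎" maps each start position to the end positions of candidate words:
toy_freq = {'北京': 100, '北': 0, '欢迎': 50, '欢': 0}
print(get_DAG("北京欢迎", toy_freq, 150))  # {0: [1], 1: [1], 2: [3], 3: [3]}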
def calc(sentence, DAG, route, total, freq):
    N = len(sentence)
    route[N] = (0, 0)  # (log-probability, end position of the chosen word)
    logtotal = log(total)
    for i in range(N - 1, -1, -1):  # dynamic programming, from the end of the sentence backwards
        route[i] = max(
            (log(freq.get(sentence[i: x + 1]) or 1) - logtotal + route[x + 1][0], x) for x in DAG[i]
        )
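Running calc on the same toy example fills the route table; each entry stores the best log-probability from that position to the end, plus the end index of the word to take:
toy_freq = {'北京': 100, '北': 0, '欢迎': 50, '欢': 0}
toy_route = {}
calc("北京欢迎", get_DAG("北京欢迎", toy_freq, 150), toy_route, 150, toy_freq)
print(toy_route)  # toy_route[0][1] == 1 -> take '北京', then toy_route[2][1] == 3 -> take '欢迎'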
def cut_DAG_NO_HMM(sentence, freq, total):  # segment with the dictionary only: build the DAG, then pick the best path by dynamic programming
DAG = get_DAG(sentence, freq, total)
    calc(sentence, DAG, route, total, freq)  # fills the global route table
x = 0
N = len(sentence)
buf = ""
lst = []
while x < N:
y = route[x][1] + 1
l_word = sentence[x: y]
        if re_eng.match(l_word) and len(l_word) == 1:  # a single letter or digit
            buf += l_word  # buffer consecutive letters/digits into one token; Han words are emitted directly from the route
x = y
else:
if buf:
lst.append(buf)
buf = ""
lst.append(l_word)
x = y
if buf:
lst.append(buf)
buf = ""
return lst
def cut_DAG(sentence, freq, total):
    DAG = get_DAG(sentence, freq, total)
route = {}
calc(sentence, DAG, route, total, freq)
x = 0
N = len(sentence)
buf = ""
lst = []
while x < N:
y = route[x][1] + 1
l_word = sentence[x: y]
if y - x == 1:
buf += l_word
else:
if buf:
if len(buf) == 1:
lst.append(buf)
buf = ""
                elif not freq.get(buf):  # the buffered run is not a dictionary word: hand it to the HMM
                    blocks = re_han_cut_all.split(buf)
                    for blk in blocks:
                        if re_han_cut_all.match(blk):
                            # note: the Han block goes straight to the HMM without another dictionary pass,
                            # which may be a problem
                            vit = viterbi(blk, states, start_p, trans_p, emit_p)
                            recognized = get_voc(vit, blk)
                            for t in recognized:
                                lst.append(t)
                        else:
                            tmp = re_skip_vit.split(blk)
                            for seg in tmp:  # do not reuse x here: it is the position counter of the outer loop
                                if seg:
                                    lst.append(seg)
                else:
                    for elem in buf:
                        lst.append(elem)
                buf = ""
            lst.append(l_word)
            x = y
if buf:
if len(buf) == 1:
lst.append(buf)
    elif not freq.get(buf):  # the leftover buffer is not a dictionary word: hand it to the HMM
        blocks = re_han_cut_all.split(buf)
        for blk in blocks:
            if re_han_cut_all.match(blk):
                # note: the Han block goes straight to the HMM without another dictionary pass,
                # which may be a problem
                vit = viterbi(blk, states, start_p, trans_p, emit_p)
                recognized = get_voc(vit, blk)
                for t in recognized:
                    lst.append(t)
            else:
                tmp = re_skip_vit.split(blk)
                for seg in tmp:
                    if seg:
                        lst.append(seg)
    else:
        for elem in buf:
            lst.append(elem)
return lst
def cut_all_possible(sentence, freq, total):  # full mode: emit every word the dictionary can find
    DAG = get_DAG(sentence, freq, total)
    old_j = -1  # end position of the last word emitted
    lst = []
    for k, L in DAG.items():
        if len(L) == 1 and k > old_j:  # only one candidate here and not already covered by an earlier word
            lst.append(sentence[k: L[0] + 1])
            old_j = L[0]
        else:
            for j in L:
                if j > k:  # emit every multi-character word starting at k
                    lst.append(sentence[k: j + 1])
                    old_j = j
    return lst
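On the toy data full mode simply emits the two dictionary words; with the real dict.txt the output is much denser because overlapping words are all reported:
toy_freq = {'北京': 100, '北': 0, '欢迎': 50, '欢': 0}
print(cut_all_possible("北京欢迎", toy_freq, 150))  # ['北京', '欢迎']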
def cut(sentence, cut_all=False, HMM=False):
    lfreq, ltotal = gen_pfdict(dict_dir)
    lst = []
    if cut_all:  # full mode keeps only Han runs; default mode also keeps letters, digits and a few symbols
        re_han = re_han_cut_all
        re_skip = re_skip_cut_all
    else:
        re_han = re_han_default
        re_skip = re_skip_default
    if cut_all:
        cut_block = cut_all_possible
    elif HMM:
        cut_block = cut_DAG
    else:
        cut_block = cut_DAG_NO_HMM
    blocks = re_han.split(sentence)
    for blk in blocks:
        if not blk:
            continue
        if re_han.match(blk):
            for word in cut_block(blk, lfreq, ltotal):
                lst.append(word)
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if re_skip.match(x):
                    lst.append(x)
                elif not cut_all:
                    for elem in x:
                        lst.append(elem)
                else:
                    lst.append(x)
    return lst
sentence = "北京欢迎你"
print(cut(sentence, cut_all=False, HMM=False))
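For comparison, the other two modes run the same way; the exact output depends on the bundled dict.txt:
print(cut(sentence, cut_all=True, HMM=False))   # full mode: every dictionary word found
print(cut(sentence, cut_all=False, HMM=True))   # dictionary segmentation plus HMM for runs of single characters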
"""
sentence = "北京欢迎你aaa aa"
lfreq, ltotal = gen_pfdict(dict_dir)
DAG = get_DAG(sentence, lfreq, ltotal)
calc(sentence, DAG, route, ltotal, lfreq)
# print(route)
print(cut_DAG_NO_HMM(sentence, lfreq, ltotal))
print("*" * 50)
print(cut_DAG(sentence, lfreq, ltotal))
print("*" * 50)
print(cut_all_possible(sentence, lfreq, ltotal))
"""
"""
sentence = "大海欢迎你"
sentence = strdecode(sentence)
vit = viterbi(sentence, states, start_p, trans_p, emit_p)
voc = get_voc(vit, sentence)
print(voc)
"""
In the jieba library the HMM is trained by counting: the transition matrix, initial matrix and so on are all obtained from simple corpus statistics. I'm not sure whether this is entirely right, because when I used viterbi on its own for segmentation, even sentences like "北京欢迎你" were cut incorrectly. Perhaps it is a corpus problem?
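To make "training by counting" concrete, here is a minimal sketch of maximum-likelihood estimation from a pre-segmented corpus. The toy corpus and the helper word2tags are my own illustration, not jieba's actual training code; like jieba's tables the results are stored as log-probabilities, but no smoothing is applied here.
from collections import defaultdict
from math import log

def word2tags(word):  # tag a single word with the BMES scheme
    if len(word) == 1:
        return "S"
    return "B" + "M" * (len(word) - 2) + "E"

def train_hmm(segmented_sentences):
    start = defaultdict(int)
    trans = defaultdict(lambda: defaultdict(int))
    emit = defaultdict(lambda: defaultdict(int))
    for words in segmented_sentences:
        tags = "".join(word2tags(w) for w in words)
        chars = "".join(words)
        start[tags[0]] += 1                       # which state begins a sentence
        for i, (ch, tag) in enumerate(zip(chars, tags)):
            emit[tag][ch] += 1                    # state -> character counts
            if i > 0:
                trans[tags[i - 1]][tag] += 1      # state -> state counts
    def normalise(counter):  # counts -> log-probabilities
        total = sum(counter.values())
        return {k: log(v / total) for k, v in counter.items()}
    return (normalise(start),
            {s: normalise(d) for s, d in trans.items()},
            {s: normalise(d) for s, d in emit.items()})

toy_corpus = [["北京", "欢迎", "你"], ["你", "好"]]  # a tiny pre-segmented corpus
toy_start, toy_trans, toy_emit = train_hmm(toy_corpus)
print(toy_start)  # {'B': log(1/2), 'S': log(1/2)}
With a large enough corpus these counts converge to tables like prob_start, prob_trans and prob_emit; with a corpus this small, viterbi will of course segment almost everything wrongly, which is consistent with the suspicion above.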