Extracting the word-segmentation model from jieba: a simple reimplementation

Since my skills are limited, I simplified the code of the segmentation module in the jieba library; everything below is implemented with plain functions.

import jieba
from jieba.finalseg.prob_emit import P as emit_p
from jieba.finalseg.prob_start import P as start_p
from jieba.finalseg.prob_trans import P as trans_p
import os
from math import log
import re
init_dir = jieba.__file__
jieba_dir = os.path.dirname(init_dir)
dict_dir = os.path.join(jieba_dir, "dict.txt")  # locate dict.txt inside the installed jieba package
re_eng = re.compile("[a-zA-Z0-9]", re.U)  # a single ASCII letter or digit

re_han_default = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#\._]+)", re.U)  # runs of Chinese characters, letters, digits and +#._
re_skip_default = re.compile("(\r\n|\s)", re.U)  # line breaks or whitespace

re_han_cut_all = re.compile("([\u4E00-\u9FA5]+)", re.U)  # full (cut_all) mode: Chinese characters only
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)  # split on anything that is not a letter, digit, + or #
re_skip_vit = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")  # a number with a decimal point, or a run of letters/digits
route = {}  # route[i] = (best log-probability of sentence[i:], end index of its first word)


text_type = str  # under Python 3, decoded text is simply str

MIN_FLOAT = -3.14e100  # log-probability stand-in for "practically impossible"
states = "BMES"  # Begin / Middle / End of a multi-char word, Single-character word

# For each state, the states allowed to precede it: e.g. a word can only Begin
# right after another word has Ended or after a Single-character word.
PrevStatus = {
    'B': 'ES',
    'M': 'MB',
    'S': 'SE',
    'E': 'BM'
}


def strdecode(sentence):
    if not isinstance(sentence, text_type):  # not already a Unicode str
        try:
            sentence = sentence.decode('utf-8')  # decode UTF-8 bytes to Unicode
        except UnicodeDecodeError:  # on a UnicodeDecodeError, fall back to GBK
            sentence = sentence.decode('gbk', 'ignore')  # 'ignore' skips illegal characters
    return sentence


def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]  # tabulated state log-probabilities, one dict per character
    path = {}
    for y in states:  # initialise the log-probability of each state for the first character
        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
        path[y] = [y]  # remember the path leading to each state
    for t in range(1, len(obs)):
        V.append({})
        newpath = {}
        for y in states:
            em_p = emit_p[y].get(obs[t], MIN_FLOAT)
            # best log-probability of being in state y at position t, choosing the
            # predecessor state y0 (at position t-1) that maximises it
            (prob, state) = max([(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
            V[t][y] = prob
            newpath[y] = path[state] + [y]  # keep only the most probable path into y
        path = newpath
    # the last character can only be E (end of a word) or S (single-character word);
    # pick whichever of the two is more probable
    (prob, state) = max((V[len(obs) - 1][y], y) for y in 'ES')

    return (prob, path[state])


def get_voc(viterbi, sentence):  # convert a B/M/E/S tag sequence back into a list of words
    prob, pos_list = viterbi
    voc = []
    begin, nexti = 0, 0
    for i, char in enumerate(sentence):
        pos = pos_list[i]
        if pos == "B":
            begin = i
        elif pos == "E":
            voc.append(sentence[begin: i + 1])
            nexti = i + 1
        elif pos == "S":
            voc.append(char)
            nexti = i + 1
    if nexti < len(sentence):
        voc.append(sentence[nexti:])
    return voc
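
# Quick illustration: with a hand-written tag sequence (not actual Viterbi output),
#   get_voc((0.0, ['B', 'E', 'B', 'E', 'S']), "北京欢迎你")
# returns ["北京", "欢迎", "你"]: each B..E span becomes one word, each S a single-character word.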


def resolve_filename(f):
    try:
        return f.name
    except AttributeError:
        return repr(f)


def get_dict_file(file):
    return open(file, "rb")


def gen_pfdict(file):
    lfreq = {}
    ltotal = 0
    f = get_dict_file(file)
    f_name = resolve_filename(f)  # readable file name, used only in the error message below
    for lineno, line in enumerate(f, 1):
        try:
            line = line.strip().decode("utf-8")
            word, freq = line.split(" ")[: 2]
            freq = int(freq)
            lfreq[word] = freq
            ltotal += freq
            for ch in range(len(word)):  # register every prefix of the word (freq 0) so get_DAG can walk prefixes
                wfrag = word[: ch + 1]
                if wfrag not in lfreq:
                    lfreq[wfrag] = 0
        except ValueError:
            raise ValueError("invalid dictionary entry in %s at line %s: %s" % (f_name, lineno, line))
    f.close()
    return lfreq, ltotal


def get_DAG(sentence, freq, total):
    # For each start index k, list the end indices i such that sentence[k:i+1]
    # is in the dictionary; a bare character is always allowed as a fallback.
    DAG = {}
    N = len(sentence)
    for k in range(N):
        tmplist = []
        i = k
        frag = sentence[k]
        while i < N and frag in freq:
            if freq[frag]:
                tmplist.append(i)
            i += 1
            frag = sentence[k: i + 1]
        if not tmplist:
            tmplist.append(k)
        DAG[k] = tmplist
    return DAG
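
# Illustration (the exact indices depend on what is in dict.txt): for "北京欢迎你",
# get_DAG would return something like {0: [0, 1], 1: [1], 2: [2, 3], 3: [3], 4: [4]},
# i.e. a word starting at position 0 may end at 0 ("北") or 1 ("北京"),
# a word starting at 2 may end at 2 ("欢") or 3 ("欢迎"), and "你" stands alone.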


def calc(sentence, DAG, route, total, freq):
    # Dynamic programming from the end of the sentence backwards:
    # route[i] = (best log-probability of segmenting sentence[i:], end index of its first word)
    N = len(sentence)
    route[N] = (0, 0)
    logtotal = log(total)
    for i in range(N - 1, -1, -1):
        route[i] = max(
            (log(freq.get(sentence[i: x + 1]) or 1) - logtotal + route[x + 1][0], x) for x in DAG[i]
        )
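
# Illustration (numbers are made up): with the DAG above, route might end up as
#   {5: (0, 0), 4: (-8.1, 4), 3: (-17.6, 3), 2: (-13.9, 3), 1: (-22.4, 1), 0: (-18.7, 1)}
# route[0] = (-18.7, 1) says the best segmentation of the whole sentence starts with
# sentence[0:2] ("北京"); following the second elements (1 -> 3 -> 4) rebuilds "北京/欢迎/你".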


def cut_DAG_NO_HMM(sentence, freq, total):  # dictionary-only segmentation: best DAG path via dynamic programming, no HMM
    DAG = get_DAG(sentence, freq, total)
    route = {}
    calc(sentence, DAG, route, total, freq)
    x = 0
    N = len(sentence)
    buf = ""
    lst = []

    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x: y]
        if re_eng.match(l_word) and len(l_word) == 1:  # a single ASCII letter or digit
            buf += l_word  # buffer consecutive letters/digits so they come out as one token
            x = y
        else:
            if buf:
                lst.append(buf)
                buf = ""
            lst.append(l_word)
            x = y
    if buf:
        lst.append(buf)
        buf = ""
    return lst
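
# Illustration (assumed output, depends on dict.txt):
#   lfreq, ltotal = gen_pfdict(dict_dir)
#   cut_DAG_NO_HMM("北京欢迎你2020", lfreq, ltotal)  # -> ["北京", "欢迎", "你", "2020"]
# Consecutive ASCII digits/letters are buffered into one token; Chinese words come
# straight from the best path through the DAG.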


def cut_DAG(sentence, freq, total):
    DAG = get_DAG(sentence, freq, total)
    route = {}
    calc(sentence, DAG, route, total, freq)
    x = 0
    N = len(sentence)
    buf = ""
    lst = []
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x: y]
        if y - x == 1:
            buf += l_word  # collect consecutive single characters; the HMM may regroup them
        else:
            if buf:
                if len(buf) == 1:
                    lst.append(buf)
                elif not freq.get(buf):  # the buffer is unknown to the dictionary: hand it to the HMM
                    blocks = re_han_cut_all.split(buf)
                    for blk in blocks:
                        if re_han_cut_all.match(blk):  # pure Chinese run: Viterbi over B/M/E/S states
                            vit = viterbi(blk, states, start_p, trans_p, emit_p)
                            recognized = get_voc(vit, blk)
                            for t in recognized:
                                lst.append(t)
                        else:  # non-Chinese remainder: split into number/letter runs
                            tmp = re_skip_vit.split(blk)
                            for seg in tmp:
                                if seg:
                                    lst.append(seg)
                else:  # the buffer is a known word: emit each character separately
                    for elem in buf:
                        lst.append(elem)
                buf = ""
            lst.append(l_word)
        x = y
    if buf:  # flush whatever single characters are left at the end of the sentence
        if len(buf) == 1:
            lst.append(buf)
        elif not freq.get(buf):
            blocks = re_han_cut_all.split(buf)
            for blk in blocks:
                if re_han_cut_all.match(blk):
                    vit = viterbi(blk, states, start_p, trans_p, emit_p)
                    recognized = get_voc(vit, blk)
                    for t in recognized:
                        lst.append(t)
                else:
                    tmp = re_skip_vit.split(blk)
                    for seg in tmp:
                        if seg:
                            lst.append(seg)
        else:
            for elem in buf:
                lst.append(elem)
    return lst


def cut_all_possible(sentence, freq, total):  # full (cut_all) mode: emit every dictionary word found along the DAG
    DAG = get_DAG(sentence, freq, total)
    old_j = -1
    lst = []
    for k, L in DAG.items():
        if len(L) == 1 and k > old_j:
            lst.append(sentence[k: L[0] + 1])
            old_j = L[0]
        else:
            for j in L:
                if j > k:
                    lst.append(sentence[k: j + 1])
                    old_j = j
    return lst


def cut(sentence, cut_all=False, HMM=False):
    lfreq, ltotal = gen_pfdict(dict_dir)
    lst = []
    if cut_all:  # full mode only hands pure Chinese runs to cut_block
        re_han = re_han_cut_all
        re_skip = re_skip_cut_all
    else:
        re_han = re_han_default
        re_skip = re_skip_default

    if cut_all:
        cut_block = cut_all_possible
    elif HMM:
        cut_block = cut_DAG
    else:
        cut_block = cut_DAG_NO_HMM
    blocks = re_han.split(sentence)
    for blk in blocks:
        if not blk:
            continue
        if re_han.match(blk):
            for word in cut_block(blk, lfreq, ltotal):
                lst.append(word)
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if re_skip.match(x):
                    lst.append(x)
                elif not cut_all:
                    for elem in x:
                        lst.append(elem)
                else:
                    lst.append(x)
    return lst


sentence = "北京欢迎你"
print(cut(sentence, cut_all=False, HMM=False))

"""
sentence = "北京欢迎你aaa aa"
lfreq, ltotal = gen_pfdict(dict_dir)
DAG = get_DAG(sentence, lfreq, ltotal)
calc(sentence, DAG, route, ltotal, lfreq)
# print(route)
print(cut_DAG_NO_HMM(sentence, lfreq, ltotal))
print("*" * 50)
print(cut_DAG(sentence, lfreq, ltotal))
print("*" * 50)
print(cut_all_possible(sentence, lfreq, ltotal))
"""
"""
sentence = "大海欢迎你"
sentence = strdecode(sentence)
vit = viterbi(sentence, states, start_p, trans_p, emit_p)
voc = get_voc(vit, sentence)
print(voc)
"""


In the jieba library, the HMM is trained by simple counting: the initial, transition and emission probability tables are all obtained from corpus statistics. I am not entirely sure this is sufficient, because when I run the Viterbi segmenter on its own, even a sentence like "北京欢迎你" gets cut incorrectly. Perhaps it is a problem with the training corpus?
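
For reference, here is a minimal sketch of what that training-by-counting could look like on a tiny hand-labelled corpus. The corpus, the function name train_hmm and the complete lack of smoothing are my own assumptions for illustration; this is not jieba's actual training code, whose tables were estimated from a much larger corpus.

from collections import defaultdict
from math import log


def train_hmm(tagged_corpus):
    # tagged_corpus: list of (characters, tags) pairs, tags drawn from "BMES"
    start_c = defaultdict(int)
    trans_c = defaultdict(lambda: defaultdict(int))
    emit_c = defaultdict(lambda: defaultdict(int))
    for chars, tags in tagged_corpus:
        start_c[tags[0]] += 1                       # state of the first character
        for i, (ch, tag) in enumerate(zip(chars, tags)):
            emit_c[tag][ch] += 1                    # state -> character (emission)
            if i > 0:
                trans_c[tags[i - 1]][tag] += 1      # state -> state (transition)

    def to_log_probs(counter):                      # normalise counts to log-probabilities
        total = sum(counter.values())
        return {k: log(v / total) for k, v in counter.items()}

    start_p = to_log_probs(start_c)
    trans_p = {s: to_log_probs(c) for s, c in trans_c.items()}
    emit_p = {s: to_log_probs(c) for s, c in emit_c.items()}
    return start_p, trans_p, emit_p


# Toy corpus: two hand-segmented sentences, "北京/欢迎/你" and "我/爱/北京"
toy_corpus = [("北京欢迎你", "BEBES"), ("我爱北京", "SSBE")]
print(train_hmm(toy_corpus)[0])  # {'B': log(1/2), 'S': log(1/2)}

Note that states which never occur are simply missing from these toy tables, whereas jieba's prob_start and prob_trans assign them MIN_FLOAT-style values.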
