Deep Learning and Natural Language Processing Lab: Computing the Information Entropy of Chinese

Problem Description

First read Entropy_of_English_PeterBrown and, following that paper, compute the average information entropy of Chinese. Dataset: https://share.weiyun.com/5zGPyJX

Experimental Principles

Information Entropy

The concept of information entropy was first proposed by Shannon (1916-2001) in 1948, borrowing the notion of entropy from thermodynamics, to quantify the uncertainty of information: the larger the entropy, the more uncertain the information. Mathematically it is defined as:
$$
H(X) = \sum_{x\in X} P(x)\log\frac{1}{P(x)} = -\sum_{x\in X} P(x)\log P(x)
$$
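
As a quick illustration (a minimal sketch, not part of the original experiment), the formula can be evaluated directly for a toy distribution; a fair coin carries exactly 1 bit of uncertainty, while a biased one carries less:

import math

def entropy(probs):
    # Shannon entropy in bits: H = -sum(p * log2(p))
    return -sum(p * math.log(p, 2) for p in probs if p > 0)

print(entropy([0.5, 0.5]))  # 1.0
print(entropy([0.9, 0.1]))  # about 0.469
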
For jointly distributed random variables $(X, Y)\sim P(X, Y)$, the conditional entropy of $X$ given $Y$ is

$$
\begin{aligned}
H(X\mid Y) &= -\sum_{y\in Y} P(y)\sum_{x\in X} P(x\mid y)\log P(x\mid y)\\
&= -\sum_{y\in Y}\sum_{x\in X} P(y)\,P(x\mid y)\log P(x\mid y)\\
&= -\sum_{y\in Y}\sum_{x\in X} P(x,y)\log P(x\mid y)
\end{aligned}
$$
This conditional entropy is what the bigram and trigram model calculations below are based on.

Parameter Estimation for the Language Models

This experiment uses unigram, bigram, and trigram models to compute the information entropy of the collected Jin Yong novels.
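
Concretely, all model parameters are maximum-likelihood estimates obtained from raw corpus counts, with no smoothing ($C$ denotes a count and $N$ the total number of words); the per-word entropy estimates computed by the code below are then:

$$
\hat P(w)=\frac{C(w)}{N},\qquad
\hat P(w_2\mid w_1)=\frac{C(w_1,w_2)}{C(w_1)},\qquad
\hat P(w_3\mid w_1,w_2)=\frac{C(w_1,w_2,w_3)}{C(w_1,w_2)}
$$

$$
\begin{aligned}
H_{\text{uni}}&=-\sum_{w}\hat P(w)\log_2 \hat P(w)\\
H_{\text{bi}}&=-\sum_{w_1,w_2}\hat P(w_1,w_2)\log_2 \hat P(w_2\mid w_1)\\
H_{\text{tri}}&=-\sum_{w_1,w_2,w_3}\hat P(w_1,w_2,w_3)\log_2 \hat P(w_3\mid w_1,w_2)
\end{aligned}
$$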

Experimental Procedure

Data Preprocessing

The dataset consists of 16 novels by Jin Yong. The raw text contains a large amount of mojibake as well as useless or repeated Chinese and English symbols, so it is preprocessed first:
1. Remove all hidden/control characters.
2. Remove all non-Chinese characters.
3. Remove all punctuation (context is not taken into account).

After cleaning, the text is segmented with jieba, a Chinese word-segmentation library for Python; this experiment uses its precise (accurate) mode.
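
For illustration only, here is a minimal sketch of both steps on a made-up sentence; it keeps only CJK characters with a whitelist regex, whereas the experiment's own code below uses an explicit blacklist character class:

import re
import jieba

text = "第一回  风雪惊变!(abc123)"
cleaned = re.sub(r'[^\u4e00-\u9fa5]', '', text)  # keep only CJK characters
print(cleaned)              # 第一回风雪惊变
print(jieba.lcut(cleaned))  # precise mode, e.g. ['第一回', '风雪', '惊变']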

def getCorpus(self, rootDir):
    corpus = []
    # Characters to strip: Latin letters, digits and (Chinese/English) punctuation.
    r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./::;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    for name in os.listdir(rootDir):
        path = os.path.join(rootDir, name)
        if os.path.isfile(path):
            with open(os.path.abspath(path), "r", encoding='utf-8') as f:
                filecontext = f.read()
                filecontext = re.sub(r1, '', filecontext)    # drop non-Chinese characters and punctuation
                filecontext = filecontext.replace("\n", '')  # drop line breaks
                filecontext = filecontext.replace(" ", '')   # drop spaces
                seg_list = jieba.lcut(filecontext)           # precise-mode segmentation
                corpus += seg_list
        elif os.path.isdir(path):
            corpus += self.getCorpus(path)                   # recurse into sub-directories
    return corpus

Unigram Model

For the unigram model, each word's probability is estimated by its relative frequency in the corpus, and the entropy is accumulated term by term:

    for uni_word in words_tf.items():
        # P(w) is estimated as count(w) / words_len; each term adds -P(w) * log2 P(w)
        entropy.append(-(uni_word[1]/words_len)*math.log(uni_word[1]/words_len, 2))
    print("Word-based unigram entropy of Chinese:", round(sum(entropy), 3), "bits/word")

Bigram Model

For the bigram model, the joint probability of each adjacent word pair and the conditional probability of the second word given the first are both estimated from counts, then combined according to the conditional-entropy formula above:

    for bi_word in bigram_tf.items():
        jp_xy = bi_word[1] / bigram_len  # joint probability P(w1, w2)
        cp_xy = bi_word[1] / words_tf[bi_word[0][0]]  # conditional probability P(w2 | w1)
        entropy.append(-jp_xy * math.log(cp_xy, 2))  # accumulate the bigram entropy
    print("Word-based bigram entropy of Chinese:", round(sum(entropy), 3), "bits/word")

Trigram Model

The trigram model is handled in the same way, except that the conditioning context is the preceding word pair, so the context counts are bigram counts:

    for tri_word in trigram_tf.items():
        jp_xy = tri_word[1] / trigram_len  # joint probability P(w1, w2, w3)
        cp_xy = tri_word[1] / words_tf[tri_word[0][0]]  # conditional probability P(w3 | w1, w2)
        entropy.append(-jp_xy * math.log(cp_xy, 2))  # accumulate the trigram entropy
    print("Word-based trigram entropy of Chinese:", round(sum(entropy), 3), "bits/word")

Experimental Results

Corpus Statistics

| # | Novel | Characters | Words | Avg. word length | Entropy (bits/word) | Runtime (s) |
|---|-------|-----------:|------:|-----------------:|--------------------:|------------:|
| 1 | 三十三剑客图 | 53682 | 31551 | 1.70144 | 11.65358 | 0.82091 |
| 2 | 白马啸西风 | 64061 | 42207 | 1.51778 | 9.49565 | 0.84298 |
| 3 | 书剑恩仇录 | 438338 | 255908 | 1.71287 | 11.68502 | 2.50098 |
| 4 | 侠客行 | 317099 | 190545 | 1.66417 | 10.99487 | 2.14651 |
| 5 | 倚天屠龙记 | 830439 | 487540 | 1.70332 | 11.63975 | 4.22489 |
| 6 | 天龙八部 | 830439 | 487540 | 1.70332 | 11.63975 | 4.4675 |
| 7 | 射雕英雄传 | 794124 | 480018 | 1.65436 | 11.55264 | 4.8816 |
| 8 | 碧血剑 | 420068 | 246405 | 1.70479 | 11.69014 | 2.6309 |
| 9 | 神雕侠侣 | 835666 | 502841 | 1.66189 | 11.51878 | 4.92906 |
| 10 | 笑傲江湖 | 833372 | 490987 | 1.69734 | 11.32648 | 4.64088 |
| 11 | 越女剑 | 14803 | 9293 | 1.59292 | 9.42809 | 0.58128 |
| 12 | 连城诀 | 199412 | 122177 | 1.63216 | 10.84085 | 1.59599 |
| 13 | 雪山飞狐 | 119513 | 73383 | 1.62862 | 10.773 | 1.16091 |
| 14 | 飞狐外传 | 378777 | 224513 | 1.6871 | 11.46932 | 2.26253 |
| 15 | 鸳鸯刀 | 32552 | 20658 | 1.57576 | 9.71726 | 0.65679 |
| 16 | 鹿鼎记 | 1047468 | 626266 | 1.66459 | 11.25894 | 5.1325 |

Results for the Different Models

| # | Model | Characters | Words | Avg. word length | Entropy (bits/word) | Runtime (s) |
|---|-------|-----------:|------:|-----------------:|--------------------:|------------:|
| 1 | unigram | 7420081 | 4430767 | 1.67467 | 12.01312 | 37.68415 |
| 2 | bigram | 7420081 | 4430767 | 1.67467 | 6.8915 | 41.43119 |
| 3 | trigram | 7420081 | 4430767 | 1.67467 | 6.8915 | 60.37625 |

Appendix

import jieba
import math
import time
import os
import re
class TraversalFun():

    # 1. Initialization: store the corpus root directory
    def __init__(self, rootDir):
        self.rootDir = rootDir

    def TraversalDir(self):
        return TraversalFun.getCorpus(self, self.rootDir)

    def getCorpus(self, rootDir):
        corpus = []
        # Characters to strip; the filter character class can be customised here.
        r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./::;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
        listdir = os.listdir(rootDir)
        count = 0
        for file in listdir:
            path = os.path.join(rootDir, file)
            if os.path.isfile(path):
                with open(os.path.abspath(path), "r", encoding='ansi') as f:
                    filecontext = f.read()
                    # Remove the download-site watermark before stripping other characters.
                    filecontext = filecontext.replace("本书来自www.cr173.com免费txt小说下载站", '')
                    filecontext = filecontext.replace("更多更新免费电子书请关注www.cr173.com", '')
                    filecontext = re.sub(r1, '', filecontext)
                    filecontext = filecontext.replace("\n", '')
                    filecontext = filecontext.replace(" ", '')
                    #seg_list = jieba.cut(filecontext, cut_all=True)
                    #corpus += seg_list
                    count += len(filecontext)
                    corpus.append(filecontext)
            elif os.path.isdir(path):
                # Recurse into sub-directories and merge their corpus and character count.
                sub_corpus, sub_count = self.getCorpus(path)
                corpus += sub_corpus
                count += sub_count
        return corpus, count

# Term-frequency counters used by the entropy calculations.
def get_tf(tf_dic, words):
    # Counts every word except the last one, i.e. every word that can serve
    # as the left context of a bigram.
    for i in range(len(words)-1):
        tf_dic[words[i]] = tf_dic.get(words[i], 0) + 1

def get_bigram_tf(tf_dic, words):
    # Adjacent word-pair counts, keyed by (w_i, w_{i+1}).
    for i in range(len(words)-1):
        tf_dic[(words[i], words[i+1])] = tf_dic.get((words[i], words[i+1]), 0) + 1

def get_trigram_tf(tf_dic, words):
    # Trigram counts, keyed as ((w_i, w_{i+1}), w_{i+2}) so the context is a bigram tuple.
    for i in range(len(words)-2):
        tf_dic[((words[i], words[i+1]), words[i+2])] = tf_dic.get(((words[i], words[i+1]), words[i+2]), 0) + 1
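
# Note: because get_trigram_tf keys each trigram as ((w1, w2), w3), cal_trigram below
# can reuse get_bigram_tf to obtain the (w1, w2) context counts that the conditional
# probability is divided by. A tiny sketch with a made-up word list (not from the corpus):
#   words = ['天', '下', '第', '一']
#   tf = {}
#   get_trigram_tf(tf, words)
#   # tf == {(('天', '下'), '第'): 1, (('下', '第'), '一'): 1}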

def cal_unigram(corpus, count):
    before = time.time()
    split_words = []
    words_len = 0
    line_count = 0
    words_tf = {}
    for line in corpus:
        # Segment each novel with jieba (precise mode) and accumulate word frequencies.
        for x in jieba.cut(line):
            split_words.append(x)
            words_len += 1
        get_tf(words_tf, split_words)
        split_words = []
        line_count += 1

    print("Corpus size (characters):", count)
    print("Number of words:", words_len)
    print("Average word length:", round(count / words_len, 5))
    entropy = []
    for uni_word in words_tf.items():
        # -P(w) * log2 P(w), with P(w) estimated as count(w) / words_len
        entropy.append(-(uni_word[1] / words_len) * math.log(uni_word[1] / words_len, 2))
    print("Word-based unigram entropy of Chinese:", round(sum(entropy), 5), "bits/word")
    after = time.time()
    print("Runtime:", round(after - before, 5), "s")

def cal_bigram(corpus, count):
    before = time.time()
    split_words = []
    words_len = 0
    line_count = 0
    words_tf = {}
    bigram_tf = {}

    for line in corpus:
        for x in jieba.cut(line):
            split_words.append(x)
            words_len += 1

        # Unigram (context) counts and adjacent-pair counts for this novel.
        get_tf(words_tf, split_words)
        get_bigram_tf(bigram_tf, split_words)

        split_words = []
        line_count += 1

    print("Corpus size (characters):", count)
    print("Number of words:", words_len)
    print("Average word length:", round(count / words_len, 5))

    bigram_len = sum([dic[1] for dic in bigram_tf.items()])
    print("Number of bigrams:", bigram_len)

    entropy = []
    for bi_word in bigram_tf.items():
        jp_xy = bi_word[1] / bigram_len  # joint probability P(w1, w2)
        cp_xy = bi_word[1] / words_tf[bi_word[0][0]]  # conditional probability P(w2 | w1)
        entropy.append(-jp_xy * math.log(cp_xy, 2))  # accumulate the bigram entropy
    print("Word-based bigram entropy of Chinese:", round(sum(entropy), 5), "bits/word")

    after = time.time()
    print("Runtime:", round(after - before, 5), "s")

def cal_trigram(corpus, count):
    before = time.time()
    split_words = []
    words_len = 0
    line_count = 0
    words_tf = {}
    trigram_tf = {}

    for line in corpus:
        for x in jieba.cut(line):
            split_words.append(x)
            words_len += 1

        # Here words_tf holds bigram counts: they are the (w1, w2) contexts
        # that the trigram conditional probability is normalised by.
        get_bigram_tf(words_tf, split_words)
        get_trigram_tf(trigram_tf, split_words)

        split_words = []
        line_count += 1

    print("Corpus size (characters):", count)
    print("Number of words:", words_len)
    print("Average word length:", round(count / words_len, 5))

    trigram_len = sum([dic[1] for dic in trigram_tf.items()])
    print("Number of trigrams:", trigram_len)

    entropy = []
    for tri_word in trigram_tf.items():
        jp_xy = tri_word[1] / trigram_len  # joint probability P(w1, w2, w3)
        cp_xy = tri_word[1] / words_tf[tri_word[0][0]]  # conditional probability P(w3 | w1, w2)
        entropy.append(-jp_xy * math.log(cp_xy, 2))  # accumulate the trigram entropy
    print("Word-based trigram entropy of Chinese:", round(sum(entropy), 5), "bits/word")

    after = time.time()
    print("Runtime:", round(after - before, 5), "s")


if __name__ == '__main__':
    tra = TraversalFun("./datasets")
    corpus,count = tra.TraversalDir()
    cal_unigram(corpus, count)
    cal_bigram(corpus,count)
    cal_trigram(corpus,count)

Experiment Code

The code for this experiment can also be found via the 实验代码 link.
