Problem Description
First read Entropy_of_English_PeterBrown, then compute the average information entropy of Chinese by following the approach of that paper. Dataset: https://share.weiyun.com/5zGPyJX
Experimental Principles
Information Entropy
The concept of information entropy was first proposed by Shannon (1916-2001) in 1948, borrowing the notion of "thermal entropy" from thermodynamics, to quantify the uncertainty of information: the larger the entropy, the more uncertain the information. Mathematically:
$$H(X) = \sum_{x\in X} P(x)\log\frac{1}{P(x)} = -\sum_{x\in X}P(x)\log P(x)$$
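A quick worked example (added for illustration): for a fair coin with $P(\text{heads})=P(\text{tails})=\tfrac{1}{2}$,

$$H = -\left(\tfrac{1}{2}\log_2\tfrac{1}{2}+\tfrac{1}{2}\log_2\tfrac{1}{2}\right) = 1\ \text{bit},$$

matching the intuition that one fair coin flip carries exactly one bit of uncertainty.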
For jointly distributed random variables $(X,Y)\sim P(X,Y)$, the conditional entropy of $X$ given $Y$ is
$$\begin{aligned} H(X|Y)&= -\sum_{y\in Y} P(y)\sum_{x\in X}P(x|y)\log P(x|y)\\ &=-\sum_{y\in Y}\sum_{x\in X}P(y)P(x|y)\log P(x|y)\\ &=-\sum_{y\in Y}\sum_{x\in X}P(x,y)\log P(x|y) \end{aligned}$$
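For reference (a standard identity, added here for context), conditional entropy relates to the joint and marginal entropies through the chain rule:

$$H(X,Y) = H(Y) + H(X\mid Y)$$

so estimating $H(X\mid Y)$ amounts to measuring the average additional uncertainty per word once its history is known.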
This conditional entropy is what the bigram (bi-gram) and trigram (tri-gram) models below compute: with a one-word or two-word history $Y$, the per-word entropy of the next word $X$ is $H(X|Y)$.
Parameter Estimation for the Language Models
This report uses unigram, bigram, and trigram models to estimate the information entropy of the collected Jin Yong novels.
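Concretely (a summary of what the code below implements), the probabilities are estimated by maximum likelihood from corpus counts:

$$\hat{P}(w) = \frac{c(w)}{N},\qquad \hat{P}(w_i\mid w_{i-1}) = \frac{c(w_{i-1},w_i)}{c(w_{i-1})},\qquad \hat{P}(w_i\mid w_{i-2},w_{i-1}) = \frac{c(w_{i-2},w_{i-1},w_i)}{c(w_{i-2},w_{i-1})}$$

where $c(\cdot)$ is a count over the segmented corpus and $N$ is the total number of tokens; the joint probabilities $P(x,y)$ in the entropy formula are estimated as n-gram counts divided by the total number of n-grams.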
Experimental Procedure
Data Preprocessing
The dataset consists of 16 novels by Jin Yong. The raw files contain a large amount of mojibake as well as useless or duplicated Chinese and English symbols, so the data are preprocessed as follows:
1. Remove all hidden characters.
2. Remove all non-Chinese characters.
3. Remove all punctuation marks (context is not taken into account).
Word segmentation in the preprocessing is done with jieba, a Chinese word-segmentation library for Python; this experiment uses its accurate (default) mode, illustrated by the short sketch below.
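A minimal sketch of jieba's two relevant modes (the sentence is a made-up example, not from the corpus):

```python
import jieba

# Accurate mode (the default): splits the sentence into a
# non-overlapping sequence of words, which is what entropy
# counting requires.
print(jieba.lcut("信息熵表示信息的不确定性"))

# Search-engine mode additionally emits sub-words of long words,
# so its output overlaps and would distort the counts.
print(jieba.lcut_for_search("信息熵表示信息的不确定性"))
```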
The corpus-collection routine:

```python
def getCorpus(self, rootDir):
    corpus = []
    # Characters to strip: ASCII letters/digits plus Chinese and English punctuation.
    r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./::;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    for file in os.listdir(rootDir):
        path = os.path.join(rootDir, file)
        if os.path.isfile(path):
            with open(os.path.abspath(path), "r", encoding='utf-8') as f:
                filecontext = f.read()
            filecontext = re.sub(r1, '', filecontext)
            filecontext = filecontext.replace("\n", '')
            filecontext = filecontext.replace(" ", '')
            seg_list = jieba.lcut(filecontext)  # accurate mode, as described above
            corpus += seg_list
        elif os.path.isdir(path):
            corpus += self.getCorpus(path)  # recurse into subdirectories
    return corpus
```
Unigram Model
```python
for uni_word in words_tf.items():
    # H = -sum over words of p(w) * log2 p(w)
    entropy.append(-(uni_word[1]/words_len)*math.log(uni_word[1]/words_len, 2))
```
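To make the computation concrete, here is a self-contained toy run of the same unigram formula on a hypothetical three-token corpus (the tokens and numbers are illustrative only):

```python
import math

tokens = ["郭靖", "黄蓉", "郭靖"]          # toy segmented corpus
words_len = len(tokens)                    # N = 3
words_tf = {}
for w in tokens:
    words_tf[w] = words_tf.get(w, 0) + 1   # unigram counts

entropy = sum(-(c / words_len) * math.log(c / words_len, 2)
              for c in words_tf.values())
print(round(entropy, 3))                   # 0.918 bits/token for p = (2/3, 1/3)
```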
Bigram Model
```python
for bi_word in bigram_tf.items():
    jp_xy = bi_word[1] / bigram_len               # joint probability p(x,y)
    cp_xy = bi_word[1] / words_tf[bi_word[0][0]]  # conditional probability of the second word given the first
    entropy.append(-jp_xy * math.log(cp_xy, 2))   # contribution to the bigram entropy
print("Chinese information entropy of the word-based bigram model:", round(sum(entropy), 3), "bits/word")
```
Trigram Model
```python
for tri_word in trigram_tf.items():
    jp_xy = tri_word[1] / trigram_len               # joint probability p(x,y,z)
    cp_xy = tri_word[1] / words_tf[tri_word[0][0]]  # conditional probability of the third word given the preceding pair (words_tf here holds bigram counts)
    entropy.append(-jp_xy * math.log(cp_xy, 2))     # contribution to the trigram entropy
print("Chinese information entropy of the word-based trigram model:", round(sum(entropy), 3), "bits/word")
```
Experimental Results
Corpus Statistics

| # | Novel | Characters | Tokens | Avg. word length | Entropy (bits/word) | Runtime (s) |
|---|---|---|---|---|---|---|
| 1 | 三十三剑客图 | 53682 | 31551 | 1.70144 | 11.65358 | 0.82091 |
| 2 | 白马啸西风 | 64061 | 42207 | 1.51778 | 9.49565 | 0.84298 |
| 3 | 书剑恩仇录 | 438338 | 255908 | 1.71287 | 11.68502 | 2.50098 |
| 4 | 侠客行 | 317099 | 190545 | 1.66417 | 10.99487 | 2.14651 |
| 5 | 倚天屠龙记 | 830439 | 487540 | 1.70332 | 11.63975 | 4.22489 |
| 6 | 天龙八部 | 830439 | 487540 | 1.70332 | 11.63975 | 4.4675 |
| 7 | 射雕英雄传 | 794124 | 480018 | 1.65436 | 11.55264 | 4.8816 |
| 8 | 碧血剑 | 420068 | 246405 | 1.70479 | 11.69014 | 2.6309 |
| 9 | 神雕侠侣 | 835666 | 502841 | 1.66189 | 11.51878 | 4.92906 |
| 10 | 笑傲江湖 | 833372 | 490987 | 1.69734 | 11.32648 | 4.64088 |
| 11 | 越女剑 | 14803 | 9293 | 1.59292 | 9.42809 | 0.58128 |
| 12 | 连城诀 | 199412 | 122177 | 1.63216 | 10.84085 | 1.59599 |
| 13 | 雪山飞狐 | 119513 | 73383 | 1.62862 | 10.773 | 1.16091 |
| 14 | 飞狐外传 | 378777 | 224513 | 1.6871 | 11.46932 | 2.26253 |
| 15 | 鸳鸯刀 | 32552 | 20658 | 1.57576 | 9.71726 | 0.65679 |
| 16 | 鹿鼎记 | 1047468 | 626266 | 1.66459 | 11.25894 | 5.1325 |
Results under Different Models

| # | Model | Characters | Tokens | Avg. word length | Entropy (bits/word) | Runtime (s) |
|---|---|---|---|---|---|---|
| 1 | unigram | 7420081 | 4430767 | 1.67467 | 12.01312 | 37.68415 |
| 2 | bigram | 7420081 | 4430767 | 1.67467 | 6.8915 | 41.43119 |
| 3 | trigram | 7420081 | 4430767 | 1.67467 | 6.8915 | 60.37625 |
Appendix
```python
import jieba
import math
import time
import os
import re


class TraversalFun():
    # 1. Initialization
    def __init__(self, rootDir):
        self.rootDir = rootDir

    def TraversalDir(self):
        return self.getCorpus(self.rootDir)

    def getCorpus(self, rootDir):
        corpus = []
        # Characters to strip; users can customize the filter here.
        r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./::;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
        listdir = os.listdir(rootDir)
        count = 0
        for file in listdir:
            path = os.path.join(rootDir, file)
            if os.path.isfile(path):
                # 'ansi' maps to the system code page and only works on Windows;
                # substitute the files' actual encoding (e.g. 'gb18030') elsewhere.
                with open(os.path.abspath(path), "r", encoding='ansi') as f:
                    filecontext = f.read()
                # Drop the downloader's ad line before any other filtering,
                # since the regex below would otherwise mangle it first.
                filecontext = filecontext.replace(
                    "本书来自www.cr173.com免费txt小说下载站\n更多更新免费电子书请关注www.cr173.com", '')
                filecontext = re.sub(r1, '', filecontext)
                filecontext = filecontext.replace("\n", '')
                filecontext = filecontext.replace(" ", '')
                count += len(filecontext)
                corpus.append(filecontext)
            elif os.path.isdir(path):
                # Recurse into subdirectories.
                sub_corpus, sub_count = self.getCorpus(path)
                corpus += sub_corpus
                count += sub_count
        return corpus, count


# Frequency counting, used when computing the entropies.
def get_tf(tf_dic, words):
    for i in range(len(words)):
        tf_dic[words[i]] = tf_dic.get(words[i], 0) + 1


def get_bigram_tf(tf_dic, words):
    for i in range(len(words) - 1):
        tf_dic[(words[i], words[i + 1])] = tf_dic.get((words[i], words[i + 1]), 0) + 1


def get_trigram_tf(tf_dic, words):
    for i in range(len(words) - 2):
        tf_dic[((words[i], words[i + 1]), words[i + 2])] = \
            tf_dic.get(((words[i], words[i + 1]), words[i + 2]), 0) + 1


def cal_unigram(corpus, count):
    before = time.time()
    split_words = []
    words_len = 0
    line_count = 0
    words_tf = {}
    for line in corpus:
        for x in jieba.cut(line):
            split_words.append(x)
            words_len += 1
        get_tf(words_tf, split_words)
        split_words = []
        line_count += 1
    print("Corpus characters:", count)
    print("Tokens:", words_len)
    print("Average word length:", round(count / words_len, 5))
    entropy = []
    for uni_word in words_tf.items():
        entropy.append(-(uni_word[1] / words_len) * math.log(uni_word[1] / words_len, 2))
    print("Chinese information entropy of the word-based unigram model:", round(sum(entropy), 5), "bits/word")
    after = time.time()
    print("Runtime:", round(after - before, 5), "s")


def cal_bigram(corpus, count):
    before = time.time()
    split_words = []
    words_len = 0
    line_count = 0
    words_tf = {}
    bigram_tf = {}
    for line in corpus:
        for x in jieba.cut(line):
            split_words.append(x)
            words_len += 1
        get_tf(words_tf, split_words)
        get_bigram_tf(bigram_tf, split_words)
        split_words = []
        line_count += 1
    print("Corpus characters:", count)
    print("Tokens:", words_len)
    print("Average word length:", round(count / words_len, 5))
    bigram_len = sum(bigram_tf.values())
    print("Number of bigrams:", bigram_len)
    entropy = []
    for bi_word in bigram_tf.items():
        jp_xy = bi_word[1] / bigram_len               # joint probability p(x,y)
        cp_xy = bi_word[1] / words_tf[bi_word[0][0]]  # conditional probability of the second word given the first
        entropy.append(-jp_xy * math.log(cp_xy, 2))   # bigram entropy contribution
    print("Chinese information entropy of the word-based bigram model:", round(sum(entropy), 5), "bits/word")
    after = time.time()
    print("Runtime:", round(after - before, 5), "s")


def cal_trigram(corpus, count):
    before = time.time()
    split_words = []
    words_len = 0
    line_count = 0
    words_tf = {}
    trigram_tf = {}
    for line in corpus:
        for x in jieba.cut(line):
            split_words.append(x)
            words_len += 1
        # For the trigram model the history is a word pair, so the
        # denominator counts are bigram counts.
        get_bigram_tf(words_tf, split_words)
        get_trigram_tf(trigram_tf, split_words)
        split_words = []
        line_count += 1
    print("Corpus characters:", count)
    print("Tokens:", words_len)
    print("Average word length:", round(count / words_len, 5))
    trigram_len = sum(trigram_tf.values())
    print("Number of trigrams:", trigram_len)
    entropy = []
    for tri_word in trigram_tf.items():
        jp_xy = tri_word[1] / trigram_len               # joint probability p(x,y,z)
        cp_xy = tri_word[1] / words_tf[tri_word[0][0]]  # conditional probability of the third word given the preceding pair
        entropy.append(-jp_xy * math.log(cp_xy, 2))     # trigram entropy contribution
    print("Chinese information entropy of the word-based trigram model:", round(sum(entropy), 5), "bits/word")
    after = time.time()
    print("Runtime:", round(after - before, 5), "s")


if __name__ == '__main__':
    tra = TraversalFun("./datasets")
    corpus, count = tra.TraversalDir()
    cal_unigram(corpus, count)
    cal_bigram(corpus, count)
    cal_trigram(corpus, count)
```
Experiment Code
The code for this experiment can be found at 实验代码.