# Preface
# The Maximum Entropy Principle
Under the known constraints, we treat everything unknown as "equally likely", i.e. the probabilities are equal; and when the probabilities are equal, the entropy is maximized.
Let's look at Example 6.1 from the book.
Example 6.1 Suppose the random variable X can take 5 values {A, B, C, D, E}, and we want to estimate the probabilities P(A), P(B), P(C), P(D), P(E).
Solution:
These probabilities satisfy the constraint:
P(A) + P(B) + P(C) + P(D) + P(E) = 1
With no other constraints, the maximum entropy principle says the most reasonable estimate is that all values are equally probable, i.e.:
P(A) = P(B) = P(C) = P(D) = P(E) = 1/5
Sometimes prior knowledge gives us additional constraints, for example:
P(A) + P(B) = 3/10
P(A) + P(B) + P(C) + P(D) + P(E) = 1
Then the maximum entropy principle says the most reasonable estimate is that A and B are equally probable, while C, D and E split the remaining probability equally, i.e.:
P(A) = P(B) = 3/20
P(C) = P(D) = P(E) = 7/30
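As a quick numerical check of this example (not from the book), we can maximize the entropy directly under the two constraints with a generic constrained optimizer. The sketch below assumes scipy is available; the variable names are just for illustration.

```python
# Numerical check of Example 6.1 (illustrative, assumes scipy is installed):
# maximize the entropy subject to sum(p) = 1 and P(A) + P(B) = 3/10.
import numpy as np
from scipy.optimize import minimize

def neg_entropy(p):
    # minimizing the negative entropy is the same as maximizing the entropy
    return np.sum(p * np.log(p))

constraints = [
    {"type": "eq", "fun": lambda p: np.sum(p) - 1.0},       # probabilities sum to 1
    {"type": "eq", "fun": lambda p: p[0] + p[1] - 3 / 10},  # P(A) + P(B) = 3/10
]
result = minimize(neg_entropy, x0=np.full(5, 0.2),
                  bounds=[(1e-9, 1.0)] * 5, constraints=constraints)
print(np.round(result.x, 4))  # expected: about [0.15, 0.15, 0.2333, 0.2333, 0.2333]
```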
Even under these known constraints there are still infinitely many admissible probability distributions, so we need to learn the optimal model from data.
As for the derivation, 《统计学习方法》 (Statistical Learning Methods) already explains it very well; this post focuses on the implementation.
# Implementation
Model:
$$P_w(y \mid x) = \frac{\exp\left(\sum_{i=1}^{n} w_i f_i(x, y)\right)}{\sum_y \exp\left(\sum_{i=1}^{n} w_i f_i(x, y)\right)}$$
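For a fixed x the denominator only normalizes over the candidate labels, so the model is a softmax over the scores $s(y) = \sum_i w_i f_i(x, y)$. A minimal sketch of that computation (the function name and example scores are illustrative, separate from the implementation later in this post):

```python
# Softmax form of P_w(y | x): given the score sum_i w_i * f_i(x, y) for each
# candidate label y of a fixed x, normalize the exponentiated scores.
import numpy as np

def conditional_prob(scores):
    """scores: dict mapping each candidate label y to sum_i w_i * f_i(x, y)."""
    labels = list(scores)
    exp_scores = np.exp([scores[y] for y in labels])
    return dict(zip(labels, exp_scores / exp_scores.sum()))

# e.g. two candidate labels with illustrative scores for some fixed x
print(conditional_prob({1: 0.4, -1: -0.1}))
```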
Objective function:
$$\min f(w) = \sum_x \tilde{P}(x) \log \sum_y \exp\left(\sum_{i=1}^{n} w_i f_i(x, y)\right) - \sum_{x, y} \tilde{P}(x, y) \sum_{i=1}^{n} w_i f_i(x, y)$$
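To make the objective concrete, here is a small sketch that evaluates f(w) on a toy dataset, assuming one binary indicator feature per (x, y) pair observed in the data; the sample values, weights, and names are all illustrative.

```python
# Evaluate f(w) for toy data with indicator features keyed by observed (x, y) pairs.
import numpy as np
from collections import Counter

samples = [("x1", 1), ("x1", -1), ("x2", 1)]                         # toy (x, y) pairs
n = len(samples)
p_xy = {xy: c / n for xy, c in Counter(samples).items()}             # empirical P~(x, y)
p_x = {x: c / n for x, c in Counter(x for x, _ in samples).items()}  # empirical P~(x)
labels = sorted({y for _, y in samples})
w = {xy: 0.1 for xy in p_xy}  # one weight per indicator feature

def score(x, y):
    # sum_i w_i * f_i(x, y): with indicator features only the (x, y) feature can fire
    return w.get((x, y), 0.0)

f_w = sum(px * np.log(sum(np.exp(score(x, y)) for y in labels)) for x, px in p_x.items()) \
    - sum(p * score(x, y) for (x, y), p in p_xy.items())
print(f_w)
```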
Gradient:
$$\frac{\partial f(w)}{\partial w_i} = \sum_{x, y} \tilde{P}(x) P_w(y \mid x) f_i(x, y) - E_{\tilde{P}}(f_i)$$
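With the same kind of indicator features, the gradient has one component per feature weight: for the feature keyed by an observed pair (x, y) it reduces to $\tilde{P}(x) P_w(y \mid x) - \tilde{P}(x, y)$, i.e. the model expectation of the feature minus its empirical expectation. A minimal sketch with the same illustrative toy data, shown here before the full implementation:

```python
# Gradient of f(w) for toy data with indicator features (illustrative names).
import numpy as np
from collections import Counter

samples = [("x1", 1), ("x1", -1), ("x2", 1)]
n = len(samples)
p_xy = {xy: c / n for xy, c in Counter(samples).items()}             # empirical P~(x, y)
p_x = {x: c / n for x, c in Counter(x for x, _ in samples).items()}  # empirical P~(x)
labels = sorted({y for _, y in samples})
w = {xy: 0.1 for xy in p_xy}

def p_w(y, x):
    # P_w(y | x) from the model formula, with indicator features
    exps = {yy: np.exp(w.get((x, yy), 0.0)) for yy in labels}
    return exps[y] / sum(exps.values())

# one partial derivative per feature weight
grad = {(x, y): p_x[x] * p_w(y, x) - p_xy[(x, y)] for (x, y) in w}
print(grad)
```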
# -*- coding: utf-8 -*-
import numpy as np


class MaxEntropy:
    def __init__(self):
        self.feature_dict = None

    def train(self, dataset, labels, max_iter=1000):
        # Group the samples and labels into per-x label counts, e.g. samples
        # [[1, "M"], [1, "M"], [1, "M"]] with labels [-1, -1, 1] become
        # {"[1, 'M']": {-1: (2, 0), 1: (1, 0)}}.
        # Every (x, y) pair seen in the data defines one feature function,
        # stored as (count, weight).
        feature_dict = dict()
        n = len(dataset)  # number of samples
        alpha = 0.01      # learning rate
        for data, label in zip(dataset, labels):
            data = str(data)  # use the string form of x as the dictionary key
            if feature_dict.get(data) is None:
                feature_dict[data] = dict()
            label_dict = feature_dict[data]
            if label_dict.get(label) is None:
                label_dict[label] = (1, 0)  # (count, weight)
            else:
                count, weight = label_dict[label]
                label_dict[label] = (count + 1, weight)
        self.feature_dict = feature_dict
        for _ in range(max_iter):
            for data, label_dict in self.feature_dict.items():
                # empirical marginal P~(x): fraction of samples with this x
                P_marginal_empirical_x = sum(count for count, _ in label_dict.values()) / n
                for label, (count, weight) in label_dict.items():
                    # gradient of the objective w.r.t. this feature's weight:
                    # P~(x) * P_w(y|x) * f(x, y) - P~(x, y) * f(x, y),
                    # where the feature value f(x, y) is taken to be the count
                    gradient = P_marginal_empirical_x * count * self.predict(data, label) - count / n * count
                    new_weight = weight - alpha * gradient
                    self.feature_dict[data][label] = (count, new_weight)
        print(self.feature_dict)

    def predict(self, data, label):
        # return P_w(label | data) under the current weights
        data = str(data)
        numerator = 0    # numerator of the model formula
        denominator = 0  # denominator (normalization over the labels seen with this x)
        for key, (count, weight) in self.feature_dict[data].items():  # e.g. {-1: (2, 0), 1: (1, 0)}
            if key == label:
                numerator = np.exp(weight * count)
            denominator += np.exp(weight * count)
        return numerator / denominator


if __name__ == '__main__':
    dataSet = [[1, "S"], [1, "M"], [1, "M"], [1, "S"], [1, "S"],
               [2, "S"], [2, "M"], [2, "M"], [2, "L"], [2, "L"],
               [3, "L"], [3, "M"], [3, "M"], [3, "L"], [3, "L"]]
    labels = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    maxEntropy = MaxEntropy()
    maxEntropy.train(dataSet, labels)
    print(maxEntropy.predict([1, "M"], 1))