原理为由频率直方图估计概率,然后计算互信息。结果会有误差,其中data为多行2列数组
import numpy as np
def MI(data, bins=300):
hist_dd, bins_dd = np.histogramdd(data, bins=bins, density=False)
hist_x, bins_x = np.histogram(data[:, 0], bins=bins, density=False)
hist_y, bins_y = np.histogram(data[:, 1], bins=bins, density=False)
# 计算概率
hist_dd = hist_dd / len(data)
hist_x = hist_x / len(data)
hist_y = hist_y / len(data)
# 计算互信息
mi = 0
for i in range(len(bins_x) - 1):
for j in range(len(bins_y) - 1):
p_xy = hist_dd[i, j]
p_x = hist_x[i]
p_y = hist_y[j]
if p_xy == 0 or p_x == 0 or p_y == 0:
continue
mi += p_xy * np.log2(p_xy / (p_x * p_y))
print("互信息为:%f" % (mi))
return mi
---------------------------------------更新-归一化互信息-----------------------------------------------
import numpy as np
def MI(data):
hist_dd, bins_dd = np.histogramdd(data, bins=500, density=False)
hist_x, bins_x = np.histogram(data[:, 0], bins=500, density=False)
hist_y, bins_y = np.histogram(data[:, 1], bins=500, density=False)
# 计算概率
hist_dd = hist_dd / len(data)
hist_x = hist_x / len(data)
hist_y = hist_y / len(data)
# 计算互信息
mi = 0
for i in range(len(bins_x) - 1):
for j in range(len(bins_y) - 1):
p_xy = hist_dd[i, j]
p_x = hist_x[i]
p_y = hist_y[j]
if p_xy == 0 or p_x == 0 or p_y == 0:
continue
mi += p_xy * np.log2(p_xy / (p_x * p_y))
h_x = 0
for i in range(len(bins_x) - 1):
p_x = hist_x[i]
if p_x == 0:
continue
h_x += -p_x * np.log2(p_x)
h_y = 0
for i in range(len(bins_y) - 1):
p_y = hist_y[i]
if p_y == 0:
continue
h_y += -p_y * np.log2(p_y)
mi = 2*mi / (h_x + h_y)
# print("互信息为:%f" % (mi))
return mi