# 第五章 统计思维 (Chapter 5: Statistical Thinking)
# KL Divergence
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import entropy, pearsonr
from sklearn import manifold, datasets
from sklearn.metrics import normalized_mutual_info_score
n_points = 1000
X, color = datasets.make_s_curve(n_points, random_state=0)
x, col = datasets.make_swiss_roll(n_points, noise=0.05)
X1, X2, X3 = X.T
# Create figure
fig = plt.figure(figsize=(8, 8))
# Add 3d scatter plot
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
ax.view_init(4, -72)
plt.show()
# Create figure
fig = plt.figure(figsize=(8, 8))
# Add 3d scatter plot
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=col, cmap=plt.cm.Spectral)
ax.view_init(4, -72)
plt.show()
plt.plot(X1, X3, 'ro')
plt.show()
print(normalized_mutual_info_score(X1, X3), pearsonr(X1, X3))
# Output: 1.0 (-0.08697318258913389, 0.005921261651160034)
def processNegVals(x):
x = np.array(x)
minx = np.min(x)
if minx < 0:
x = x + abs(minx)
x = x + 0.000001
px = x/np.sum(x)
return px
print(entropy(processNegVals(X1), processNegVals(X3)))
# Output: 0.9820849486320271
def KL(P, Q):
epsilon = 0.00001
P = processNegVals(P)
Q = processNegVals(Q)
divergence = np.sum(P*np.log(P/Q))
return divergence
print(KL(X1, X3))
# Output: 0.9820849486320271
plt.plot(x[:, 0], x[:, 2], 'ro')
plt.show()
print(KL(x[:, 0], x[:, 2]))
# Output: 0.5714407581920489
print(entropy(processNegVals(x[:, 0]), processNegVals(x[:, 2])))
# Output: 0.5730153281109394
import scipy
vec = scipy.special.rel_entr(processNegVals(x[:, 0]), processNegVals(x[:, 2]))
kl_div = np.sum(vec)
print(kl_div)
# Output: 0.5184330437935203
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples,
factor=.5,
noise=.05)
c1, c2 = noisy_circles[0][:, 0], noisy_circles[0][:, 1]
plt.plot(c1, c2, 'ro')
plt.show()
print(KL(c1, c2), entropy(processNegVals(c1), processNegVals(c2)), pearsonr(c1, c2))
# Output: 0.3763193691703814 0.3763193691703816 (3.7145624444947286e-05, 0.9988530856054932)
noisy_moons = datasets.make_moons(n_samples, noise=.05)
m1, m2 = noisy_moons[0][:, 0], noisy_moons[0][:, 1]
plt.plot(m1, m2, 'ro')
plt.show()
n_samples = 1000
de_linearize = lambda X: np.cos(1.5 * np.pi * X) + np.cos(5 * np.pi * X)
X = np.sort(np.random.rand(n_samples)) * 2
y = de_linearize(X) + np.random.randn(n_samples) * 0.1
plt.plot(X, y)
plt.show()
print(KL(X, y), entropy(processNegVals(X), processNegVals(y)), pearsonr(X, y))
# Output: 0.3988967088040082 0.3988967088040082 (-0.050574762504227044, 0.10996799031076543)