Datawhale代码地址
参考周志华《机器学习》
1. 朴素贝叶斯算法
朴素贝叶斯代码实现
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.naive_bayes import GaussianNB

sns.set()

# Demo: Gaussian naive Bayes on a synthetic two-cluster dataset.
# make_blobs generates isotropic Gaussian blobs:
#   n_samples: number of points, n_features: dimensionality,
#   centers: number of cluster centers (default 3),
#   cluster_std: per-cluster standard deviation (default 1.0),
#   random_state: RNG seed for reproducibility.
X, y = make_blobs(n_samples=100, n_features=2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
plt.show()

# Fit a Gaussian naive Bayes classifier on the labeled blobs.
model = GaussianNB()
model.fit(X, y)

# Predict on 2000 random points spread over the rectangle
# [-6, 8] x [-14, 4] (list + array broadcasts element-wise).
rng = np.random.RandomState(0)
X_test = [-6, -14] + [14, 18] * rng.rand(2000, 2)
y_pred = model.predict(X_test)

# Overlay the (faint) predictions on the training data, restoring the
# original axis limits afterwards so the training view is unchanged.
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
lim = plt.axis()
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred, s=20, cmap='RdBu', alpha=0.1)
plt.axis(lim)
plt.show()

# Posterior class probabilities for the last 8 test points.
# Fix: the bare expression `yprob[-8:].round(2)` only displays inside a
# notebook; print it explicitly so the script shows the result too.
yprob = model.predict_proba(X_test)
print(yprob[-8:].round(2))
2. EM(Expectation-Maximization)算法
1. numpy实现EM算法
本代码用于模拟k=2个正态分布的均值估计。其中ini_data(Sigma,Mu1,Mu2,k,N)函数用于生成训练样本,此训练样本是从两个高斯分布中随机生成的,其中高斯分布a均值Mu1=40、均方差Sigma=6,高斯分布b均值Mu2=20、均方差Sigma=6,生成的样本分布如下图所示。由于本问题中实际无法直接从样本数据中获知两个高斯分布参数,因此需要使用EM算法估算出具体Mu1、Mu2取值。
# -*- coding: utf-8 -*-
import numpy as np
import math
import copy
import matplotlib.pyplot as plt
isdebug = True

# Reference: Tom M. Mitchell, "Machine Learning", p.137
# Adapted from http://blog.csdn.net/chasdmeng/article/details/38709063


def init_data(Sigma, Mu1, Mu2, k, N):
    """Generate N observations from a 2-component Gaussian mixture.

    Each sample is drawn from N(Mu1, Sigma) or N(Mu2, Sigma) with equal
    probability (both components share the same standard deviation Sigma).
    Fills the module-level globals:
      X            -- (1, N) array of observations
      Mu           -- length-k array of randomly initialised mean estimates
      Expectations -- (N, k) responsibility matrix (initialised to zeros)
    """
    global X
    global Mu
    global Expectations
    X = np.zeros((1, N))
    Mu = np.random.random(k)
    Expectations = np.zeros((N, k))
    for idx in range(N):
        # A fair coin flip decides which component generates this sample.
        if np.random.random(1) > 0.5:
            X[0, idx] = np.random.normal(Mu1, Sigma)
        else:
            X[0, idx] = np.random.normal(Mu2, Sigma)
    if isdebug:
        print("***********")
        print("初始观测数据X:")
        print(X)
# EM step 1 (E-step): compute the responsibilities E[z_ij].
def e_step(Sigma, k, N):
    """Fill Expectations[i, j] with the posterior probability that sample i
    came from component j, given the current means Mu and the shared
    standard deviation Sigma.

    The Gaussian normalising constant is identical for every component, so
    it cancels in the ratio and only the exponential term is evaluated.
    """
    global Expectations
    global Mu
    global X
    # Hoist the loop-invariant 1/(2*sigma^2) factor.
    inv_two_var = 1.0 / (2.0 * float(Sigma ** 2))
    for i in range(N):
        weights = [math.exp(-(float(X[0, i] - Mu[j]) ** 2) * inv_two_var)
                   for j in range(k)]
        total = sum(weights)
        for j in range(k):
            Expectations[i, j] = weights[j] / total
    if isdebug:
        print("***********")
        print("隐藏变量E(Z):")
        print(Expectations)
# EM step 2 (M-step): re-estimate each mean Mu[j] as the
# responsibility-weighted average of the observations.
def m_step(k, N):
    global Expectations
    global X
    for j in range(k):
        weighted_sum = 0
        weight_total = 0
        for i in range(N):
            weighted_sum += Expectations[i, j] * X[0, i]
            weight_total += Expectations[i, j]
        # Element assignment mutates Mu in place, so no `global Mu` is needed.
        Mu[j] = weighted_sum / weight_total
# Run EM for at most iter_num iterations, stopping early once the total
# change in the means drops below Epsilon.
def run(Sigma, Mu1, Mu2, k, N, iter_num, Epsilon):
    init_data(Sigma, Mu1, Mu2, k, N)
    print("初始<u1,u2>:", Mu)
    for step in range(iter_num):
        previous = copy.deepcopy(Mu)
        e_step(Sigma, k, N)
        m_step(k, N)
        print(step, Mu)
        # L1 distance between consecutive mean estimates as the
        # convergence criterion.
        if sum(abs(Mu - previous)) < Epsilon:
            break
if __name__ == '__main__':
    # Ground-truth parameters used to synthesise the training data.
    sigma = 6         # shared standard deviation of both Gaussians
    mu1 = 40          # true mean of the first component
    mu2 = 20          # true mean of the second component
    k = 2             # number of mixture components
    N = 1000          # number of samples
    iter_num = 1000   # maximum number of EM iterations
    epsilon = 0.0001  # stop once the means change less than this
    run(sigma, mu1, mu2, k, N, iter_num, epsilon)
    # Histogram of the generated samples: two overlapping bell shapes.
    plt.hist(X[0, :], 50)
    plt.show()
2. sklearn实现EM算法
1. EM算法实现
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances_argmin
# Use a CJK-capable font and fix minus-sign rendering for Chinese labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

if __name__ == '__main__':
    # Pick the solver: sklearn's GaussianMixture or the hand-written EM loop.
    # style = 'sklearn'
    style = 'myself'
    np.random.seed(0)

    # Component 1: axis-aligned Gaussian, 400 samples.
    # np.random.multivariate_normal(mean, cov, size): cov must be symmetric
    # positive semi-definite.
    mu1_fact = (0, 0, 0)
    cov1_fact = np.diag((1, 2, 3))
    data1 = np.random.multivariate_normal(mu1_fact, cov1_fact, 400)
    print('data1 shape: {0}'.format(data1.shape))

    # Component 2: correlated Gaussian, 100 samples.
    mu2_fact = (2, 2, 1)
    cov2_fact = np.array(((4, 1, 3), (1, 2, 1), (3, 1, 4)))
    data2 = np.random.multivariate_normal(mu2_fact, cov2_fact, 100)
    print('data2 shape: {0}'.format(data2.shape))

    data = np.vstack((data1, data2))
    print('data shape: {0}'.format(data.shape))
    # True membership: first 400 samples belong to component 1.
    y = np.array([True] * 400 + [False] * 100)

    if style == 'sklearn':
        g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000)
        g.fit(data)
        print('类别概率:\t', g.weights_[0])
        print('均值:\n', g.means_, '\n')
        print('方差:\n', g.covariances_, '\n')
        mu1, mu2 = g.means_
        sigma1, sigma2 = g.covariances_
    else:
        num_iter = 100
        n, d = data.shape
        # Initialise the two means at the data extremes and the covariances
        # as identity matrices; mixing weight starts at 0.5.
        mu1 = data.min(axis=0)
        mu2 = data.max(axis=0)
        sigma1 = np.identity(d)
        sigma2 = np.identity(d)
        pi = 0.5
        for it in range(num_iter):
            # E-step: responsibilities gamma from the current Gaussians.
            norm1 = multivariate_normal(mu1, sigma1)
            norm2 = multivariate_normal(mu2, sigma2)
            tau1 = pi * norm1.pdf(data)
            tau2 = (1 - pi) * norm2.pdf(data)
            gamma = tau1 / (tau1 + tau2)
            # M-step: responsibility-weighted means, covariances, weight.
            mu1 = np.dot(gamma, data) / np.sum(gamma)
            mu2 = np.dot((1 - gamma), data) / np.sum((1 - gamma))
            sigma1 = np.dot(gamma * (data - mu1).T, data - mu1) / np.sum(gamma)
            sigma2 = np.dot((1 - gamma) * (data - mu2).T, data - mu2) / np.sum(1 - gamma)
            pi = np.sum(gamma) / n
            print(it, ":\t", mu1, mu2)
        print('类别概率:\t', pi)
        print('均值:\t', mu1, mu2)
        print('方差:\n', sigma1, '\n\n', sigma2, '\n')

    # Classification: assign each point to the component with the larger
    # probability density under the fitted Gaussians.
    norm1 = multivariate_normal(mu1, sigma1)
    norm2 = multivariate_normal(mu2, sigma2)
    tau1 = norm1.pdf(data)
    tau2 = norm2.pdf(data)

    fig = plt.figure(figsize=(10, 5), facecolor='w')
    ax = fig.add_subplot(121, projection='3d')
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', edgecolors='k', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('原始数据', fontsize=15)

    ax = fig.add_subplot(122, projection='3d')
    # Match each estimated mean to the nearest true mean so cluster labels
    # line up with the ground truth before computing accuracy.
    order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean')
    # order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='cosine')
    print(order)
    if order[0] == 0:
        c1 = tau1 > tau2
    else:
        c1 = tau1 < tau2
    c2 = ~c1
    # Accuracy = fraction of predictions equal to the true labels.
    acc = np.mean(y == c1)
    print('准确率:%.2f%%' % (100 * acc))

    ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', edgecolors='k', depthshade=True)
    ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', edgecolors='k', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('EM算法分类', fontsize=15)
    plt.suptitle('EM算法的实现', fontsize=18)
    plt.subplots_adjust(top=0.90)
    # plt.tight_layout()
    plt.show()
2. GMM与DPGMM比较
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import scipy as sp
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
def expand(a, b, rate=0.05):
    """Widen the interval [a, b] by `rate` of its length on each side."""
    margin = (b - a) * rate
    return a - margin, b + margin
# Use a CJK-capable font and fix minus-sign rendering (consistent with the
# `mpl` alias used elsewhere in this script).
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

if __name__ == '__main__':
    # Synthesise two clusters: the first is linearly transformed so its
    # covariance becomes correlated, the second stays axis-aligned.
    np.random.seed(0)
    cov1 = np.diag((1, 2))
    N1 = 500
    N2 = 300
    N = N1 + N2
    x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1)
    m = np.array(((1, 1), (1, 3)))
    x1 = x1.dot(m)
    x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
    x = np.vstack((x1, x2))
    y = np.array([0] * N1 + [1] * N2)
    # Deliberately over-specify the component count: the DP prior should
    # switch superfluous components off while the plain GMM uses them all.
    n_components = 3

    # Plotting setup: colormap and a slightly padded evaluation grid.
    colors = '#A0FFA0', '#2090E0', '#FF8080'
    cm = mpl.colors.ListedColormap(colors)
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)

    plt.figure(figsize=(6, 6), facecolor='w')
    plt.suptitle('GMM/DPGMM比较', fontsize=15)

    # ---------- GMM ----------
    ax = plt.subplot(211)
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
    gmm.fit(x)
    centers = gmm.means_
    covs = gmm.covariances_
    print('GMM均值 = \n', centers)
    print('GMM方差 = \n', covs)
    y_hat = gmm.predict(x)
    grid_hat = gmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')
    clrs = list('rgbmy')
    for i, (center, cov) in enumerate(zip(centers, covs)):
        # Fix: use np.linalg instead of sp.linalg — `import scipy as sp`
        # does not import the scipy.linalg submodule, so sp.linalg may not
        # exist; numpy's eigh/norm are equivalent for a symmetric 2x2 cov.
        value, vector = np.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / np.linalg.norm(vector[0])
        angle = 180 * np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color=clrs[i], alpha=0.5, clip_box=ax.bbox)
        ax.add_artist(e)
    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('GMM', fontsize=15)
    # Fix: the `b` keyword of grid() was removed in Matplotlib 3.5;
    # passing the flag positionally works on every version.
    plt.grid(True, ls=':', color='#606060')

    # ---------- DPGMM ----------
    dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
                                    weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print('DPGMM均值 = \n', centers)
    print('DPGMM方差 = \n', covs)
    y_hat = dpgmm.predict(x)
    print(y_hat)
    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')
    for i, cc in enumerate(zip(centers, covs)):
        if i not in y_hat:
            # Skip components the Dirichlet-process prior assigned no data.
            continue
        center, cov = cc
        value, vector = np.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / np.linalg.norm(vector[0])
        angle = 180 * np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color='m', alpha=0.5, clip_box=ax.bbox)
        ax.add_artist(e)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('DPGMM', fontsize=15)
    plt.grid(True, ls=':', color='#606060')
    # Fix: `pad` is keyword-only in Matplotlib >= 3.6; passing 2
    # positionally raises TypeError there.
    plt.tight_layout(pad=2, rect=(0, 0, 1, 0.95))
    plt.show()