from scipy import linalg
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Colormap
cmap = colors.LinearSegmentedColormap(
    'red_blue_classes',
    {'red':   [(0, 1, 1), (1, 0.7, 0.7)],
     'green': [(0, 0.7, 0.7), (1, 0.7, 0.7)],
     'blue':  [(0, 0.7, 0.7), (1, 1, 1)]})
plt.cm.register_cmap(cmap=cmap)

# Generate datasets
def dataset_fixed_cov():
    # Generate 2 Gaussian samples with the same covariance matrix
    n, dim = 300, 2
    np.random.seed(0)
    C = np.array([[0., -0.23], [0.83, .23]])
    X = np.r_[np.dot(np.random.randn(n, dim), C),
              np.dot(np.random.randn(n, dim), C) + np.array([1, 1])]
    # np.hstack() concatenates: n labels of 0 followed by n labels of 1
    y = np.hstack((np.zeros(n), np.ones(n)))
    return X, y
def dataset_cov():
    # Generate 2 Gaussian samples with different covariance matrices
    n, dim = 300, 2
    np.random.seed(0)
    C = np.array([[0., -1.], [2.5, .7]]) * 2.
    X = np.r_[np.dot(np.random.randn(n, dim), C),
              np.dot(np.random.randn(n, dim), C.T) + np.array([1, 4])]
    y = np.hstack((np.zeros(n), np.ones(n)))
    return X, y
def plot_data(lda, X, y, y_pred, fig_index):
    splot = plt.subplot(2, 2, fig_index)
    if fig_index == 1:
        plt.title("LDA")
        plt.ylabel("Data with\nfixed covariance")
    if fig_index == 2:
        plt.title("QDA")
    if fig_index == 3:
        plt.ylabel("Data with\nvarying covariance")

    tp = (y == y_pred)  # true positives: correctly classified samples
    tp0, tp1 = tp[y == 0], tp[y == 1]
    X0, X1 = X[y == 0], X[y == 1]
    X0_tp, X0_fp = X0[tp0], X0[~tp0]
    X1_tp, X1_fp = X1[tp1], X1[~tp1]

    # Class 0: dots for hits, crosses for misses
    plt.scatter(X0_tp[:, 0], X0_tp[:, 1], marker='.', color='red')
    plt.scatter(X0_fp[:, 0], X0_fp[:, 1], marker='x', s=20, color='#990000')  # dark red

    # Class 1: dots for hits, crosses for misses
    plt.scatter(X1_tp[:, 0], X1_tp[:, 1], marker='.', color='blue')
    plt.scatter(X1_fp[:, 0], X1_fp[:, 1], marker='x', s=20, color='#000099')  # dark blue

    # Class 0 and 1: decision areas
    nx, ny = 200, 100
    x_min, x_max = plt.xlim()
    y_min, y_max = plt.ylim()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),
                         np.linspace(y_min, y_max, ny))
    # predict_proba() returns the class-membership probabilities of each sample
    Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)
    plt.pcolormesh(xx, yy, Z, cmap="red_blue_classes",
                   norm=colors.Normalize(0., 1.), zorder=0)
    plt.contour(xx, yy, Z, [0.5], linewidths=2., colors="white")

    # Class means
    plt.plot(lda.means_[0][0], lda.means_[0][1], '*', color="yellow",
             markersize=15, markeredgecolor="grey")
    plt.plot(lda.means_[1][0], lda.means_[1][1], '*', color="yellow",
             markersize=15, markeredgecolor="grey")
    return splot
def plot_ellipse(splot, mean, cov, color):
    v, w = linalg.eigh(cov)
    u = w[0] / linalg.norm(w[0])
    angle = np.arctan(u[1] / u[0])
    angle = 180 * angle / np.pi  # convert to degrees
    # filled Gaussian at 2 standard deviations
    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                              angle=180 + angle, facecolor=color,
                              edgecolor='black', linewidth=2)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.2)
    splot.add_artist(ell)
    splot.set_xticks(())
    splot.set_yticks(())

def plot_lda_cov(lda, splot):
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')

def plot_qda_cov(qda, splot):
    plot_ellipse(splot, qda.means_[0], qda.covariance_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariance_[1], 'blue')
plt.figure(figsize=(10, 8), facecolor="white")
plt.suptitle("LDA vs QDA", y=0.98, fontsize=15)
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # LDA
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    # 'tight': set limits just large enough to show all data, then disable further autoscaling
    plt.axis('tight')
    # QDA
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.tight_layout()
plt.subplots_adjust(top=0.92)
plt.show()
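The four panels make the point qualitatively; a quick numeric companion is to compare the training accuracy of the two models on each dataset. A minimal sketch reusing the generator functions defined above (score() returns mean accuracy on the data it is given):

# Compare LDA and QDA training accuracy on both datasets (sketch).
for name, (X, y) in [('fixed covariance', dataset_fixed_cov()),
                     ('varying covariance', dataset_cov())]:
    lda_acc = LinearDiscriminantAnalysis().fit(X, y).score(X, y)
    qda_acc = QuadraticDiscriminantAnalysis().fit(X, y).score(X, y)
    print('%s: LDA %.3f, QDA %.3f' % (name, lda_acc, qda_acc))

On the shared-covariance data the two should score similarly; on the varying-covariance data QDA's quadratic boundary should have the edge.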
6. Decision tree structure
Concepts related to decision trees
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0)
clf.fit(X_train, y_train)
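train_test_split above produces a held-out set that this snippet never evaluates on; checking it is one line (a sketch, score() gives mean accuracy):

# Evaluate on the otherwise-unused held-out split (sketch).
print('test accuracy: %.3f' % clf.score(X_test, y_test))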
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
while len(stack) > 0:
    # 'pop' ensures each node is visited only once
    node_id, depth = stack.pop()
    node_depth[node_id] = depth

    # If the left and right child of a node differ, we have a split node
    is_split_node = children_left[node_id] != children_right[node_id]
    # If a split node, append the left and right children and their depth
    # to 'stack' so we can loop through them
    if is_split_node:
        stack.append((children_left[node_id], depth + 1))
        stack.append((children_right[node_id], depth + 1))
    else:
        is_leaves[node_id] = True

# Adjacent string literals ("..." "...") are concatenated automatically
print("The binary tree structure has {n} nodes and has "
      "the following tree structure:\n".format(n=n_nodes))
for i in range(n_nodes):
    if is_leaves[i]:
        print("{space}node={node} is a leaf node.".format(
            space=node_depth[i] * "\t", node=i))
    else:
        print("{space}node={node} is a split node: "
              "go to node {left} if X[:,{feature}] <= {threshold} "
              "else to node {right}.".format(
                  space=node_depth[i] * "\t",
                  node=i,
                  left=children_left[i],
                  feature=feature[i],
                  threshold=threshold[i],
                  right=children_right[i]))
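The tree module imported at the top also offers a ready-made text rendering of the same structure; a minimal sketch (its output formatting differs slightly from the manual traversal above):

# Built-in text export of the fitted tree; export_text is part of
# sklearn.tree and prints one line per node with the same thresholds.
print(tree.export_text(clf, feature_names=list(iris.feature_names)))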
from sklearn.datasets import make_blobs  # public path; _samples_generator is private
import matplotlib.pyplot as plt
import numpy as np
X, y = make_blobs(n_samples=200, centers=2, random_state=0, cluster_std=0.80)
plt.scatter(X[:,0], X[:,1], c=y, s=50, cmap="autumn")
plt.plot([0.6],[2.1],'x', color='red', markeredgewidth=2, markersize=10)
xfit = np.linspace(-1, 3.5)
# for m, b in [(1, 0.65), (0.5, 1.6), (-0.2, 2.9)]:
#     plt.plot(xfit, m * xfit + b, '-k')
# plt.show()
plt.xlim(-1, 3.5)

# Train a basic SVM.
# C controls the soft-margin error tolerance:
# as C tends to infinity, no misclassification is tolerated;
# as C becomes very small, larger classification errors are tolerated.
from sklearn.svm import SVC
model = SVC(kernel='linear', C=1)
# model = SVC(kernel='rbf', C=1)
model.fit(X, y)
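The fitted model already exposes which points pin down the margin; a quick inspection before plotting (a sketch):

# Inspect the fitted SVM: coordinates of the support vectors and
# how many of them belong to each class.
print(model.support_vectors_.shape)  # (n_support_vectors, 2)
print(model.n_support_)              # count per class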
# Plotting helper
def plot_svc_decision_function(model, ax=None, plot_support=True):
    if ax is None:
        ax = plt.gca()  # plt.gca(): get the current axes
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Create a grid on which to evaluate the model
    x = np.linspace(xlim[0], xlim[1], 30)
    y = np.linspace(ylim[0], ylim[1], 30)
    Y, X = np.meshgrid(y, x)
    xy = np.vstack([X.ravel(), Y.ravel()]).T
    P = model.decision_function(xy).reshape(X.shape)

    # Draw the three lines where the decision function equals -1, 0 and 1
    ax.contour(X, Y, P, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])

    # Plot the support vectors
    if plot_support:
        ax.scatter(model.support_vectors_[:, 0],
                   model.support_vectors_[:, 1],
                   s=300, linewidth=1, facecolors='green')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
plot_svc_decision_function(model)

# ================================================================================
# Use more data points
def plot_svm(N=10, ax=None):
    X, y = make_blobs(n_samples=200, centers=2, random_state=0, cluster_std=0.60)
    X = X[:N]
    y = y[:N]
    model = SVC(kernel='linear', C=1E10)
    model.fit(X, y)
    ax = ax or plt.gca()
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap="autumn")
    ax.set_xlim(-1, 4)
    ax.set_ylim(-1, 6)
    plot_svc_decision_function(model, ax)

# To draw several panels, use plt.subplots(M, N, figsize=(W, H)):
# declare the number of rows and columns of axes and the overall figure size.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
# Both the axes array and a list are iterable; zip pairs them element-wise.
for axi, N in zip(ax, [60, 120]):
    plot_svm(N, axi)
    axi.set_title('N={}'.format(N))
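The point of the two panels is that the model depends only on its support vectors: points far from the margin can be added or removed without changing the fit. A quick numeric check of that claim (a sketch; X_chk/y_chk are scratch names so the kernel example's X, y below stay untouched):

# Fit on the first 60 and the first 120 points; for this dataset the
# two fits select the same support vectors, since the extra points
# lie outside the margin.
X_chk, y_chk = make_blobs(n_samples=200, centers=2, random_state=0, cluster_std=0.60)
for N in (60, 120):
    m = SVC(kernel='linear', C=1E10).fit(X_chk[:N], y_chk[:N])
    print(N, m.support_vectors_)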
# ================================================================================
# SVM with a kernel
from sklearn.datasets import make_circles  # public path; _samples_generator is private
from mpl_toolkits import mplot3d

X, y = make_circles(100, factor=.1, noise=.1)
# Introduce a new dimension r
r = np.exp(-(X ** 2).sum(1))

def plot_3D(elev=30, azim=30, X=X, y=y):
    # Draw a 3D view of the lifted data with matplotlib
    ax = plt.subplot(projection='3d')
    ax.scatter3D(X[:, 0], X[:, 1], r, c=y, s=50, cmap='autumn')
    ax.view_init(elev=elev, azim=azim)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('r')

plot_3D(elev=45, azim=45, X=X, y=y)

# C=1E6 is a very strict soft margin:
# as C tends to infinity, no misclassification is tolerated;
# as C becomes very small, larger classification errors are tolerated.
clf = SVC(kernel='rbf', C=1E6)
clf.fit(X, y)
# plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf)
plt.show()
Metropolis sampling
from scipy.stats import norm
import matplotlib.pyplot as plt
import random
def norm_dist_prob(theta):
    y = norm.pdf(theta, loc=3, scale=2)
    return y

T = 5000
pi = [0 for i in range(T)]
sigma = 1
t = 0
while t < T - 1:
    t = t + 1
    # Draw a candidate by random sampling from the state-transition (proposal) kernel
    pi_star = norm.rvs(loc=pi[t - 1], scale=sigma, size=1, random_state=None)
    # Acceptance probability alpha; the random-walk proposal is symmetric,
    # so the ratio reduces to p(candidate) / p(current)
    alpha = min(1, (norm_dist_prob(pi_star[0]) / norm_dist_prob(pi[t - 1])))
    u = random.uniform(0, 1)
    if u < alpha:
        pi[t] = pi_star[0]
    else:
        pi[t] = pi[t - 1]
plt.scatter(pi, norm.pdf(pi, loc=3, scale=2), label='Target Distribution')
num_bins = 50
plt.hist(pi, num_bins, color="green", density=True, alpha=0.7, label='Samples Distribution')
plt.legend()
plt.show()
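A quick sanity check on the chain: after discarding an initial burn-in (the 20% fraction below is an arbitrary choice, not part of the code above), the retained samples' mean and standard deviation should approach the target's loc=3 and scale=2:

# Moments of the chain after a hypothetical 20% burn-in (sketch).
burn = int(0.2 * T)
chain = np.asarray(pi[burn:])
print('mean %.2f (target 3), std %.2f (target 2)' % (chain.mean(), chain.std()))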
Metropolis-Hastings
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
mu =3
sigma = 10

# Proposal (transition) density q(x)
def q(x):
    return np.exp(-(x - mu) ** 2 / (sigma ** 2))

# Draw a sample from the proposal distribution Q
def qsample():
    return np.random.normal(mu, sigma)

# Target distribution p(x)
def p(x):
    # return 0.3 * np.exp(-(x - 0.3) ** 2) + 0.7 * np.exp(-(x - 2) ** 2 / 0.3)
    return np.sqrt(x * (1 - x)) * np.sin((2.1 * np.pi) / (x + 0.05))

def mcmcsample(n=20000):
    sample = np.zeros(n)
    sample[0] = 0.5  # initialization
    for i in range(n - 1):
        qs = qsample()  # candidate drawn from the proposal Q(x)
        u = np.random.uniform(0, 1)
        # Hastings ratio for an independence proposal:
        # alpha = p(candidate) * q(current) / (p(current) * q(candidate))
        alpha_ij = (p(qs) * q(sample[i])) / (p(sample[i]) * q(qs))
        if u < min(alpha_ij, 1):
            sample[i + 1] = qs
        else:
            sample[i + 1] = sample[i]
    return sample
x = np.arange(0,1,0.001)
realdata = p(x)
sampledata = mcmcsample()
plt.plot(x, realdata, 'r', lw=1)  # target density (lw: linewidth)
plt.plot(x, q(x), 'b', lw=1)      # proposal density Q(x)
# plt.hist(sampledata, bins=x, fc='c', density=True)
plt.show()
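One cheap diagnostic for this sampler is its acceptance rate. Since consecutive samples are equal exactly when a candidate was rejected, it can be read off the chain directly (a sketch):

# Fraction of transitions where consecutive samples differ,
# i.e. where the proposed candidate was accepted.
accept_rate = np.mean(sampledata[1:] != sampledata[:-1])
print('acceptance rate: %.2f' % accept_rate)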
Generalized Linear Models, and Poisson loss for gradient boosting
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from